<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=Python%3A_NLTK_cleaning_text</id>
	<title>Python: NLTK cleaning text - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=Python%3A_NLTK_cleaning_text"/>
	<link rel="alternate" type="text/html" href="https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;action=history"/>
	<updated>2026-05-03T20:31:26Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.35.4</generator>
	<entry>
		<id>https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46924&amp;oldid=prev</id>
		<title>Onnowpurbo at 08:49, 5 February 2017</title>
		<link rel="alternate" type="text/html" href="https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46924&amp;oldid=prev"/>
		<updated>2017-02-05T08:49:59Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;a href=&quot;https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;amp;diff=46924&amp;amp;oldid=46793&quot;&gt;Show changes&lt;/a&gt;</summary>
		<author><name>Onnowpurbo</name></author>
	</entry>
	<entry>
		<id>https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46793&amp;oldid=prev</id>
		<title>Onnowpurbo: Created page with &quot; Cleaning Text      01 May 2016 Python Data Wrangling   Create some raw text  # Create a list of three strings. incoming_reports = [&quot;We are attacking on their left flank but a...&quot;</title>
		<link rel="alternate" type="text/html" href="https://onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46793&amp;oldid=prev"/>
		<updated>2017-01-29T08:40:27Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot; Cleaning Text      01 May 2016 Python Data Wrangling   Create some raw text  # Create a list of three strings. incoming_reports = [&amp;quot;We are attacking on their left flank but a...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;&lt;br /&gt;
Cleaning Text&lt;br /&gt;
&lt;br /&gt;
    01 May 2016 Python Data Wrangling &lt;br /&gt;
&lt;br /&gt;
Create some raw text&lt;br /&gt;
&lt;br /&gt;
# Create a list of three strings.&lt;br /&gt;
incoming_reports = [&amp;quot;We are attacking on their left flank but are losing many men.&amp;quot;, &lt;br /&gt;
               &amp;quot;We cannot see the enemy army. Nothing else to report.&amp;quot;, &lt;br /&gt;
               &amp;quot;We are ready to attack but are waiting for your orders.&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
Separate by word&lt;br /&gt;
&lt;br /&gt;
# import word tokenizer&lt;br /&gt;
from nltk.tokenize import word_tokenize&lt;br /&gt;
&lt;br /&gt;
# Apply word_tokenize to each element of the list called incoming_reports&lt;br /&gt;
tokenized_reports = [word_tokenize(report) for report in incoming_reports]&lt;br /&gt;
&lt;br /&gt;
# View tokenized_reports&lt;br /&gt;
tokenized_reports&lt;br /&gt;
&lt;br /&gt;
[['We',&lt;br /&gt;
  'are',&lt;br /&gt;
  'attacking',&lt;br /&gt;
  'on',&lt;br /&gt;
  'their',&lt;br /&gt;
  'left',&lt;br /&gt;
  'flank',&lt;br /&gt;
  'but',&lt;br /&gt;
  'are',&lt;br /&gt;
  'losing',&lt;br /&gt;
  'many',&lt;br /&gt;
  'men',&lt;br /&gt;
  '.'],&lt;br /&gt;
 ['We',&lt;br /&gt;
  'can',&lt;br /&gt;
  'not',&lt;br /&gt;
  'see',&lt;br /&gt;
  'the',&lt;br /&gt;
  'enemy',&lt;br /&gt;
  'army',&lt;br /&gt;
  '.',&lt;br /&gt;
  'Nothing',&lt;br /&gt;
  'else',&lt;br /&gt;
  'to',&lt;br /&gt;
  'report',&lt;br /&gt;
  '.'],&lt;br /&gt;
 ['We',&lt;br /&gt;
  'are',&lt;br /&gt;
  'ready',&lt;br /&gt;
  'to',&lt;br /&gt;
  'attack',&lt;br /&gt;
  'but',&lt;br /&gt;
  'are',&lt;br /&gt;
  'waiting',&lt;br /&gt;
  'for',&lt;br /&gt;
  'your',&lt;br /&gt;
  'orders',&lt;br /&gt;
  '.']]&lt;br /&gt;
&lt;br /&gt;
Remove punctuation&lt;br /&gt;
&lt;br /&gt;
# Import regex&lt;br /&gt;
import re&lt;br /&gt;
&lt;br /&gt;
# Import string&lt;br /&gt;
import string&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_punctuation = []&lt;br /&gt;
&lt;br /&gt;
for review in tokenized_reports:&lt;br /&gt;
&lt;br /&gt;
    new_review = []&lt;br /&gt;
    for token in review: &lt;br /&gt;
        new_token = regex.sub(u'', token)&lt;br /&gt;
        if not new_token == u'':&lt;br /&gt;
            new_review.append(new_token)&lt;br /&gt;
&lt;br /&gt;
    tokenized_reports_no_punctuation.append(new_review)&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_punctuation&lt;br /&gt;
&lt;br /&gt;
[['We',&lt;br /&gt;
  'are',&lt;br /&gt;
  'attacking',&lt;br /&gt;
  'on',&lt;br /&gt;
  'their',&lt;br /&gt;
  'left',&lt;br /&gt;
  'flank',&lt;br /&gt;
  'but',&lt;br /&gt;
  'are',&lt;br /&gt;
  'losing',&lt;br /&gt;
  'many',&lt;br /&gt;
  'men'],&lt;br /&gt;
 ['We',&lt;br /&gt;
  'can',&lt;br /&gt;
  'not',&lt;br /&gt;
  'see',&lt;br /&gt;
  'the',&lt;br /&gt;
  'enemy',&lt;br /&gt;
  'army',&lt;br /&gt;
  'Nothing',&lt;br /&gt;
  'else',&lt;br /&gt;
  'to',&lt;br /&gt;
  'report'],&lt;br /&gt;
 ['We',&lt;br /&gt;
  'are',&lt;br /&gt;
  'ready',&lt;br /&gt;
  'to',&lt;br /&gt;
  'attack',&lt;br /&gt;
  'but',&lt;br /&gt;
  'are',&lt;br /&gt;
  'waiting',&lt;br /&gt;
  'for',&lt;br /&gt;
  'your',&lt;br /&gt;
  'orders']]&lt;br /&gt;
&lt;br /&gt;
Remove filler words&lt;br /&gt;
&lt;br /&gt;
from nltk.corpus import stopwords&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_stopwords = []&lt;br /&gt;
for report in tokenized_reports_no_punctuation:&lt;br /&gt;
    new_term_vector = []&lt;br /&gt;
    for word in report:&lt;br /&gt;
        if not word in stopwords.words('english'):&lt;br /&gt;
            new_term_vector.append(word)&lt;br /&gt;
    tokenized_reports_no_stopwords.append(new_term_vector)&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_stopwords&lt;br /&gt;
&lt;br /&gt;
[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],&lt;br /&gt;
 ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],&lt;br /&gt;
 ['We', 'ready', 'attack', 'waiting', 'orders']]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
==Referensi==&lt;br /&gt;
&lt;br /&gt;
* http://chrisalbon.com/python/cleaning_text.html&lt;/div&gt;</summary>
		<author><name>Onnowpurbo</name></author>
	</entry>
</feed>