Difference between revisions of "Python: NLTK stopwords"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
Onnowpurbo (talk | contribs)  (Created page with "Directory   ~/nltk_data/corpora/stopwords")  | 
				Onnowpurbo (talk | contribs)   | 
				||
| (6 intermediate revisions by the same user not shown) | |||
| Line 2: | Line 2: | ||
  ~/nltk_data/corpora/stopwords  |   ~/nltk_data/corpora/stopwords  | ||
| + | |||
| + | |||
| + | Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik  | ||
| + | |||
| + |  import os,nltk,os.path,re,string  | ||
| + |  import argparse  | ||
| + |  from nltk.stem.porter import PorterStemmer  | ||
| + | |||
| + |  ps=PorterStemmer()  | ||
| + | |||
| + |  def parse_args():  | ||
| + |      parser = argparse.ArgumentParser()  | ||
| + |      parser.add_argument('-i', '--infile', default='', help='input filename')  | ||
| + |      return parser.parse_args()  | ||
| + | |||
| + |  args = parse_args()  | ||
| + |  infile = args.infile  | ||
| + | |||
| + |  filename = open(infile,'r')  | ||
| + |  fcontent=filename.read()  | ||
| + |  filename.close()  | ||
| + | |||
| + |  fs = fcontent.split()  | ||
| + |  wordlist=[]  | ||
| + | |||
| + |  for word in fs:  | ||
| + |      word = ps.stem(word.strip(string.punctuation).lower())  | ||
| + |      if word not in nltk.corpus.stopwords.words('english') and len(word)<15:  | ||
| + |         if word not in nltk.corpus.stopwords.words('indonesia') and len(word)<15:  | ||
| + |             if word not in wordlist:  | ||
| + |                 wordlist.append(word)  | ||
| + |                 print( word )  | ||
| + |             else:  | ||
| + |                 pass  | ||
| + |         else:  | ||
| + |             pass  | ||
| + | |||
| + | |||
| + | Masukan kata2 yang tidak ingin ada dalam text ke dalam file  | ||
| + | |||
| + |  ~/nltk_data/corpora/stopwords/indonesia  | ||
| + | |||
| + | contoh  | ||
| + | |||
| + |  saya  | ||
| + |  punya  | ||
| + |  sendiri  | ||
| + |  kami  | ||
| + |  kamu  | ||
| + |  anda  | ||
| + |  dia  | ||
| + |  mereka  | ||
| + |  jika  | ||
| + |  yang  | ||
| + |  itu  | ||
| + |  siapa  | ||
| + |  dengan  | ||
| + |  a  | ||
| + |  b  | ||
| + |  c  | ||
| + |  d  | ||
| + |  e  | ||
| + |  f  | ||
| + |  ..  | ||
| + |  ..  | ||
| + |  1  | ||
| + |  2  | ||
| + |  3  | ||
| + |  4  | ||
| + |  5  | ||
| + |  ..  | ||
| + |  ..  | ||
| + |  01/1/2017  | ||
| + |  02/1/2017  | ||
| + |  03/1/2017  | ||
| + |  04/1/2017  | ||
| + |  05/1/2017  | ||
| + |  ..  | ||
| + |  ..  | ||
| + |  00:00  | ||
| + |  00:01  | ||
| + |  00:02  | ||
| + |  00:03  | ||
| + |  00:04  | ||
| + | |||
| + | |||
| + | ==Jika sudah ada stopword==  | ||
| + | |||
| + | misalnya,  | ||
| + | |||
| + |  rm ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  touch ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia  | ||
| + |  cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia  | ||
Latest revision as of 05:25, 5 February 2017
Directory
~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def parse_args():
    """Parse the command-line options of the stopword checking script.

    Returns:
        argparse.Namespace: its ``infile`` attribute holds the value passed
        with ``-i``/``--infile``, or an empty string when the option is
        omitted.
    """
    parser = argparse.ArgumentParser()
    # The default has to be an explicit empty string literal; a bare
    # ``default=`` (as the rendered wiki page showed) is a syntax error.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args()
# Print every distinct stemmed word of the input file that is not filtered
# out by the 'english' or 'indonesia' stopword lists, in order of first
# appearance. Anything printed here is a word the stopword lists let through.
args = parse_args()
infile = args.infile

# The context manager closes the file even when reading raises.
with open(infile, 'r') as filename:
    fcontent = filename.read()

fs = fcontent.split()
wordlist = []

# Load both stopword lists once instead of re-reading them for every token,
# and keep them in a set so each membership test is a hash lookup.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words('indonesia'))

# Mirrors ``wordlist`` for fast duplicate detection; ``wordlist`` itself keeps
# the first-seen order.
seen = set()

for word in fs:
    # Normalise the token: drop surrounding punctuation, lowercase, then stem.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Skip overly long tokens, stopwords and words already reported.
    if len(word) >= 15 or word in stop_words or word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04
Jika sudah ada stopword
misalnya,
# Rebuild the 'indonesia' stopword file from scratch out of the partial lists.
# Each command must be on its own line; run as a single line, everything after
# `rm` would be treated as files to delete.
rm ~/nltk_data/corpora/stopwords/indonesia
touch ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia