NLTK를 이용한 Frequency Distributions, Conditional Frequency Distributions, Stopwords

  • Frequency Distributions
  • Conditional Frequency Distributions
  • Stopwords



import nltk

from nltk.corpus import stopwords


# Excerpt from a CNN article.
# https://edition.cnn.com/2018/11/07/politics/matthew-whitaker-attorney-general-mueller-special-counsel-probe/index.html

sent = 'The man taking over the Justice Department following Jeff Sessions firing as attorney general has argued that special counsel Robert Muellers investigation went too far. Matthew Whitaker, who was Sessions chief of staff, is expected to take over oversight of Muellers investigation into Russian interference in the 2016 election and whether Trump campaign associates colluded with Russia.'

# Split the sentence into word tokens.
origin_words = nltk.word_tokenize(sent)

# Remove stopwords.
# The stopword list is loaded once into a set, rather than being re-read from
# the corpus for every token. Tokens are lower-cased for the comparison
# because the NLTK English stopword list is lower case; without this,
# capitalised stopwords such as the leading "The" would be kept.
english_stopwords = set(stopwords.words('english'))
words = [w for w in origin_words if w.lower() not in english_stopwords]

# Frequency distribution of the tokens that remain after stopword removal.
fd = nltk.FreqDist(words)

# Token counts before and after stopword removal.
print("origin words count : ",len(origin_words))
print("After Stopword : ",len(words))

# (token, count) pairs, the distinct tokens, and their counts.
print("items >> ", fd.items())
print("keys >> ", fd.keys())
print("values >> ", fd.values())

# Plot the frequency of each token.
fd.plot()



# Choose the categories and words to compare.

import nltk

from nltk.corpus import reuters


# Reuters categories (the conditions) and the words to count (the samples).
genres_list = ['alum', 'cpu', 'ship', 'rand', 'tea']
words_list = ['smelting', 'BANK', 'CAPACITY', 'PORTS', 'Brooke']


def _genre_word_pairs(genres, targets):
    """Yield a (genre, word) pair for each occurrence of a target word.

    Walks the Reuters categories in corpus order, skips any category not in
    *genres*, and yields one pair per token of that category found in
    *targets* (exact, case-sensitive match).
    """
    for genre in reuters.categories():
        if genre not in genres:
            continue
        for word in reuters.words(categories=genre):
            if word in targets:
                yield genre, word


# Extract the pairs for the selected categories and words.
gwlist = list(_genre_word_pairs(genres_list, words_list))

# Count words per category, then show the result as a table and as a plot.
cfd = nltk.ConditionalFreqDist(gwlist)
cfd.tabulate()
cfd.plot()



+ Recent posts