Python 감성 분석 및 감성사전 구축.

2018. 12. 27. 18:36

먼저 나이브 베이즈 코드는 ratsgo 님의 블로그에서 가져왔습니다.

https://ratsgo.github.io/machine%20learning/2017/05/18/naive/

배틀그라운드 스팀 리뷰를 통한 감성 사전 구축 및 감성 분석.

배틀그라운드의 리뷰에 대해 감성사전이란 주제로 프로젝트를 진행한 내용입니다.
텍스트 마이닝 공부 목적으로 진행되어, 잘못된 코드나 분석이 있을 수 있습니다.
크롤링을 통해 데이터를 직접 수집하였고, 토픽 모델링, 감성분석, 감성사전, 나이브 베이즈를 활용하였습니다.
코드 설명은 주석으로 대체합니다.
모델에 대한 코드만 업로드 하였고, 깃허브에서 전체 코드를 보실 수 있습니다.
https://github.com/MOONJOOYOUNG/DataScience/tree/master/Sentiment%20Analysis

# 사용 라이브러리.

import nltk

import re

import collections

import itertools

import lda

import requests

import csv

import time

import math

import operator

import numpy as np

import pandas as pd

from collections import defaultdict

from pandas import read_table

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords

# --- Data preprocessing ---

# Load the raw crawled reviews; the review text is read from the column named '0'.
data = pd.read_csv(r'E:/unstruct/merge.csv')

# Coerce every entry to str so non-string cells (e.g. NaN) cannot break the
# string handling further down.
data_list = [str(i) for i in data['0']]
print(len(data_list))

# Persist the reviews under an explicit 'review' column header.
df = pd.DataFrame(data_list, columns=['review'])
df.to_csv(r'E:\unstruct\test.csv', columns=['review'])

# Read back the file that was just written.  The original re-read merge.csv
# here, but that file is accessed through column '0' above, so selecting
# 'review' from it would fail; the 'review' column only exists in test.csv.
data_list = pd.read_csv(r'E:\unstruct\test.csv')
data_list = data_list['review']

# Notebook-style preview of the first ten reviews.
data_list[0:10]

# Keep only ASCII letters and digits.  Every run of other characters
# (punctuation, whitespace, non-English text) becomes a single space so that
# word boundaries survive for the tokenizer used later.
# The original removed *all* spaces after the substitution, which glued every
# review into one giant token and made word_tokenize/pos_tag meaningless.
_NON_ALNUM_RUN = re.compile(r'[^a-zA-Z0-9]+')

pre_list = []
for i in data_list:
    text = _NON_ALNUM_RUN.sub(' ', str(i)).strip()
    # Skip reviews that contained no letters or digits at all.
    if text:
        pre_list.append(text)

# Save the cleaned reviews.
df = pd.DataFrame(pre_list, columns=['review'])
df.to_csv(r'E:\unstruct\Merge_data.csv', columns=['review'])

# Remove uninformative words: keep only nouns (NN, NNP), gerunds (VBG),
# adjectives (JJ*) and adverbs (RB*), and drop English stop words.
keep_tags = {'NN', 'NNP', 'VBG', 'JJ', 'JJS', 'JJR', 'RB', 'RBS', 'RBR'}

# Load the stop-word list once; the original re-evaluated
# stopwords.words('english') for every single token.
english_stopwords = set(stopwords.words('english'))

nnp_list = []
for i in pre_list:
    origin_words = nltk.word_tokenize(i)
    data_pos = nltk.pos_tag(origin_words)
    words_nnp = [
        word
        for word, pos in data_pos
        if pos in keep_tags and word not in english_stopwords
    ]
    nnp_list.append(words_nnp)

# Flatten the per-review word lists into a single one-dimensional list.
nnp_list_1d = list(itertools.chain.from_iterable(nnp_list))

# Notebook-style preview of the first twenty words.
nnp_list_1d[0:20]

# Domain-specific stop words for topic modelling.  The literal contains
# repeated entries; they are collapsed when stop_words_list is built below.
stop_words = ['game', 'time', 'server', 'getting', 'pubg', 'next', 'this', 'gamei', 'get', 'please', 'battlegrounds', 'stuff', 'region', 'playerunknown', 'bluehole', 'chinese', 'chinaregion', 'ng', 'engine', 'got', 'im', 't', 'xd', 'ram', 'tho', 'asian', 'but', 'busyservers', 'crash', 'no', 'of', 'so', 'me', 'don', 'too', 'for', 'you', 'my', 'gamethe', 'na', 'd', 'devs', 'ur', 'can', 'graphic', 'pc', 'the', 'able', 'a', 'negative', 'due', 's', 'steam', 'regionlockchina', 'stupid', 'complaining', 'in', 'it', 'china', 'computer', 'lag', 'problem', 'gameit', 'are', 'poor', 'itit', 'lack', 'hacking', 'iti', 'anti', 'and', 'gtx', 'to', 'is', 'br', 'ive', 'terrible', 'gear', 'wrong','pubg','lot','access','pc','thing','something','bluehole','im','battleground','battlegrounds','regionlockchina','playerunknown','is','issue','the','dont','lot','gtx','bluehole','alot','end','access','devs','fix','hacker','to','hek','br','beta','hate','my','too','im','one','someone','issue','please','development','na','every','trash','be','br','it','log','im', 'iti', 'pc', 'this', 'to', 'pubg', 'a', 'review', 'lot', 'access', 'year', 'guy', 'steam', 'playerunknown', 'bluehole', 'thing', 'china', 'one', 's', 'na', 'you', 'e', 'don', 'too', 'for', 'cheater', 'battlegrounds', 'gtx', 'dont', 'hardware', 'on', 'devs', 'bugs', 'blue', 'n', 'ban', 'can', 'eu', 'cheating', 'l', 'dude', 'h', 'v', 'kinda', 'your', 'i7', 'would', 'ping', 'be', 'xd', 'ton', 'of', 'f', 'cant', 'but', 'af', 'garbage', 'trash', 'hole', 'its', 'till', 'cheat', 'alpha', 'developer', 'test', 'win', 'gon', 'gameif', 'cpu', 'numba', 'pace', 'choice', 'pls', 'line', 'laggy', 'cod', 'r', 't', 'p', 'minute', 'network', 'ng', 'cs', 'regionlockchina', 'dev', 'man', 'the', 'in', 'me', 'g', 'k', 'gameyou', 'bp', 'gameit', 'is', 'u', 'my', 'are', 'br', 'it', 'hek', 'gamei', 'number', 'alot', 'ram', 'bit', 'just', 'chinaregion', 'd', 'buggy', 'com', 'i5', 'none', 'doot', 'issue', 'hacker', 'hate', 'gamethe', 'gamebut', 'so', 
'busyservers', 'bad', 'gpu', 'all', 'rig', 'desync', 'z', 'chinese', 'list', 'let', 'hey', 'games', 'isnt', 'value', 'okay', 'cons', 'do', 'with', 'crap', 'specs', 'no','sometimes', 'instead' ,'yet','also','ok','far','ever','nothing','bug','still','hard','very','always','way','much','lock','however','even','many','away','already','frustration','not','otherwise','little','seriously','reviews','difficult','tl','gay','never','often','gameits','gamein','critical','therethe','lots','everywhere', 'similar','useless','error','early','last']

# De-duplicated, alphabetically sorted plain-Python list of the stop words.
stop_words_list = sorted(set(stop_words))

# Lower-case every candidate word and drop the domain stop words.
# (The original heading mentioned removing '!' and '?', but that check was
# commented out; only stop-word filtering happens here.)

# A set gives constant-time membership tests; the original scanned the whole
# stop-word list once per word.
stop_word_lookup = set(stop_words_list)

words_list = [
    lowered
    for lowered in (str(raw).lower() for raw in nnp_list_1d)
    if lowered not in stop_word_lookup
]

print(words_list[0:10])

# Notebook-style display of how many words survived the filter.
len(words_list)

# --- Topic modelling ---

# Word frequencies over the filtered vocabulary.
word_count = collections.Counter()
word_count.update(words_list)

# The 1000 most frequent words as (word, count) pairs.
result = word_count.most_common(1000)

# Number of distinct words, followed by the frequency table itself.
print(len(word_count))
print(result)

# CountVectorizer & LDA
# Each element of words_list is passed to the vectorizer as its own document.
c_vetorizer = CountVectorizer(analyzer='word')
count = c_vetorizer.fit_transform(words_list)

model = lda.LDA(n_topics=8, n_iter=1000, random_state=1)
model.fit(count)

# LDA result
# get_feature_names() is gone from current scikit-learn; prefer the
# replacement and fall back only on old installations.
if hasattr(c_vetorizer, 'get_feature_names_out'):
    topic_vocab = c_vetorizer.get_feature_names_out()
else:
    topic_vocab = c_vetorizer.get_feature_names()

topic_word = model.topic_word_
n_top_word = 100

dist = []
vocab_array = np.array(topic_vocab)
for i, topic_dist in enumerate(topic_word):
    dist.append(topic_dist)
    # argsort is ascending, so walk backwards from the end.  The stop index
    # must be -(n + 1) to yield exactly n words; the original
    # [:-n_top_word:-1] returned only n - 1.
    topic_words = vocab_array[np.argsort(topic_dist)][:-(n_top_word + 1):-1]
    print('Topic', i+1, topic_words)

# --- Naive Bayes ---

# Naive Bayes model
class NaiveBayesClassifier:
    """Two-class Naive Bayes over word presence/absence.

    Class 0 is the class whose training label equals 1 (treated as the
    positive reviews by the surrounding script); every other label is class 1.

    Attributes:
        k: additive smoothing constant.
        word_probs: list of (word, p(word | class0), p(word | class1))
            tuples, filled in by train().
    """

    # POS tags kept during training (same filter as the topic-modelling step).
    _CONTENT_TAGS = frozenset(
        ('NN', 'NNP', 'VBG', 'JJ', 'JJS', 'JJR', 'RB', 'RBS', 'RBR')
    )

    # Words ignored while counting.  Tokens are lower-cased before the lookup,
    # so the upper-case 'LUL' entry can never match; it is kept as written.
    _STOP_WORDS = frozenset((
        'game','pubg','play','playing','time','gameplay','very','going','review','lul','experience','u','point','long','reason','pc','highly','community','everything','current','steam','too','this','next','playerunknown','in','all','pan','soon','anything','open','even','still','far','much','early','much','really','lot','ever','new','access','many','t','gmae','first','way','also','way','free','actually','back','someone','dont','something','nothing','LUL','instead','the','thing','a','bluehole','trying','already','almost','not','team','high','amount','away','able','and','always','everyone','year','to','day','guy','is','you','i','maybe','else','na','s','don','so','it','im','cant',
    ))

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def count_words(self, training_set):
        """Count word occurrences per class.

        Args:
            training_set: iterable of (doc, label) pairs; doc is the review
                text and label is 1 for class 0, anything else for class 1.

        Returns:
            defaultdict mapping lower-cased word -> [class0_count, class1_count].
            Every occurrence is counted, not one per document.
        """
        # Load the English stop words once instead of once per token.
        english_stopwords = set(stopwords.words('english'))

        counts = defaultdict(lambda: [0, 0])
        for doc, label in training_set:
            # Skip rows whose "text" is really a number.
            if self.isNumber(doc):
                continue

            # Whitespace tokenisation, then the same POS filter as the
            # topic-modelling preprocessing.
            tagged = nltk.pos_tag(doc.split())
            column = 0 if label == 1 else 1  # label 1 -> index 0

            for word, pos in tagged:
                if pos not in self._CONTENT_TAGS or word in english_stopwords:
                    continue
                token = str(word).lower()
                if token not in self._STOP_WORDS:
                    counts[token][column] += 1
        return counts

    def isNumber(self, s):
        """Return True if float(s) succeeds, False if it raises ValueError."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def word_probabilities(self, counts, total_class0, total_class1, k):
        """Turn raw counts into smoothed per-class probabilities.

        Returns:
            list of (word, p(word | class0), p(word | class1)) where each
            probability is (count + k) / (total + 2k).
        """
        return [
            (
                w,
                (class0 + k) / (total_class0 + 2 * k),
                (class1 + k) / (total_class1 + 2 * k),
            )
            for w, (class0, class1) in counts.items()
        ]

    def class0_probability(self, word_probs, doc):
        """Return P(class0 | doc) assuming equal class priors.

        Args:
            word_probs: list of (word, p_class0, p_class1) tuples.
            doc: review text; split on whitespace and lower-cased so it
                matches the lower-cased vocabulary built by count_words
                (the original compared raw tokens, so capitalised words
                were never recognised).

        Raises:
            ValueError: from math.log if a stored probability is <= 0 or
                >= 1.  NOTE(review): count_words counts every occurrence, so
                a word seen more often than the class total would trigger
                this; confirm against the training data.
        """
        # Set for O(1) membership instead of scanning a list per vocab word.
        docwords = {w.lower() for w in doc.split()}

        log_prob_if_class0 = log_prob_if_class1 = 0.0
        for word, prob_if_class0, prob_if_class1 in word_probs:
            if word in docwords:
                # Word present: add log p(word | class).
                log_prob_if_class0 += math.log(prob_if_class0)
                log_prob_if_class1 += math.log(prob_if_class1)
            else:
                # Word absent: add log(1 - p(word | class)).
                log_prob_if_class0 += math.log(1.0 - prob_if_class0)
                log_prob_if_class1 += math.log(1.0 - prob_if_class1)

        # p0 / (p0 + p1) == 1 / (1 + exp(log_p1 - log_p0)).  Working with the
        # difference avoids exp() of each huge negative sum, which underflows
        # to 0.0 for realistic vocabularies and made the original divide by
        # zero.  The two branches keep the exp() argument <= 0 so it cannot
        # overflow either.
        diff = log_prob_if_class1 - log_prob_if_class0
        if diff >= 0:
            ratio = math.exp(-diff)
            return ratio / (1.0 + ratio)
        return 1.0 / (1.0 + math.exp(diff))

    def train(self, corpus, num_class0=95000, num_class1=95000):
        """Fit the model on corpus, an iterable of (doc, label) pairs.

        Args:
            corpus: training pairs, passed straight to count_words().
            num_class0: number of class-0 (positive) reviews; defaults to the
                95000 that was previously hard-coded.
            num_class1: number of class-1 (negative) reviews; same default.
        """
        word_counts = self.count_words(corpus)
        self.word_probs = self.word_probabilities(
            word_counts, num_class0, num_class1, self.k
        )

    def classify(self, doc):
        """Return the class-0 probability of doc under the trained model."""
        return self.class0_probability(self.word_probs, doc)

# Load the positive reviews and keep the first 95,000 rows.
trainfile_path = r'E:/unstruct/Merge_data_po.csv'
corpus = read_table(trainfile_path, sep=',', encoding='utf-8')
train_po = corpus.iloc[:95000]
corpus_po = np.array(train_po)

# Load the negative reviews the same way.
trainfile_path = r'E:/unstruct/Merge_data_ne.csv'
corpus = read_table(trainfile_path, sep=',', encoding='utf-8')
train_ne = corpus.iloc[:95000]
corpus_ne = np.array(train_ne)

# Stack positive rows on top of negative rows: the Naive Bayes training input.
nb_data = np.concatenate([corpus_po, corpus_ne], axis=0)

# Train the model.
model = NaiveBayesClassifier()
model.train(nb_data)

# Map each word to its class-0 (positive) probability.
dict_p = {entry[0]: entry[1] for entry in model.word_probs}

# Map each word to its class-1 (negative) probability.
dict_n = {entry[0]: entry[2] for entry in model.word_probs}

# Rank both dictionaries by probability, highest first.
sort_dict_p = sorted(dict_p.items(), key=lambda pair: pair[1], reverse=True)
sort_dict_n = sorted(dict_n.items(), key=lambda pair: pair[1], reverse=True)

# Print the 30 top-ranked words of each ranking, positive first.
for ranking in (sort_dict_p, sort_dict_n):
    for word, _weight in ranking[:30]:
        print(word)

# Naive Bayes model smoke test: classify one already-cleaned sample review.

a = 'Australian servers are here The Good Great graphicsPvp combat is responsive and balancedVariety of weapons attatchments armour and health pickups vehicles tooDevs that listen to the community feedbackThe Bad Typical bugs and glitches you d expect from an early access gamee g menu freezing after game The target area is a bit annoying it s hard to focus on killing when you re too busy trying to make it into the target area and not automatically killed within the time limitHalf of the time i recieve reward points they arent even credited to my accountPurchased the first crate for 700 coins and did not even receive my items Recommendations Maybe introduce other modes andor smaller areas on the mapE g Close quarters All in all this game has great potential and is off to a great startI m excited to see what the devs have in store for us'

# Bare expression: the returned value is not stored or printed, so it is only
# shown when this runs in an interactive/notebook session.
model.classify(a)

# --- Building the sentiment dictionary ---

# Positive topics: hand-curated word lists, one list per topic.
# NOTE(review): 'awsome' sits next to 'awesome', so some misspellings look
# deliberate; confirm whether every odd spelling below is intended.

# General praise / evaluative comments.
ps_comment = ['good','fun','great','really','better','best','pretty','amazing','awesome','awsome','early','well','goodgame','gg','fantastic','enjoyable','wow','favourite','happy','love','exciting','favorite','hilarious','adrenaline','interesting','excellent']

# Game character / nature.  NOTE(review): 'competetive' may be a typo for 'competitive'.
ps_charctor = ['battleroyale','competetive','real','realistic','addictive','hardcore','strategic','military','faster','pvp','massive']

# Play style / actions.
ps_style = ['battle','fps','royale','gameplay','combat', 'random', 'war','hide','wearing','eating','hunting','hitting','hiding','loot','shooter','aim','aiming','survival','running','picking','kill','killing']

# Game features.  NOTE(review): 'miltiple' may be a typo for 'multiple'.
ps_feature = ['sniping','shot','map','bike','squad','inventory','dinner','chicken','scope','box','biggest','miltiple','customization','weapon','winner','clothing','fpp','circle','rating','zone']

# Graphics / visuals.
ps_graphic = ['air','art','detail','night', 'weather', 'graphics', 'graphic', 'character' ]

# Other games mentioned in reviews.
ps_othergame = ['battlefield', 'dayz' ,'h1z1', 'h1', 'overwatch', 'csgo', 'cs', 'arma', 'fortnite']

# Streaming / other.
ps_another = ['twitch', 'wadu', 'youtube','streamer' ]

# Negative topics.

# General complaints.  NOTE(review): 'trerrible' may be a typo for 'terrible'.
ne_comment = ['bad', 'worst', 'waste', 'unreal', 'problem', 'trash', 'trerrible', 'wrong', 'stupid', 'crash', 'ridiculous', 'garbage', 'refund', 'serious', 'anymore', 'never', 'sad', 'bye', 'boring', 'horrible' ]

# Technical environment (servers, bugs, performance).
ne_envoir = ['server','bug','lag','laggy','optimization','unplayable','matchmaking', 'waiting' ]

# Other negatives (region lock, cheating, development).
ne_another = ['chinese','region','regionlock','lock','regionlockchina','china','cheat','cheating','hack','dev','development','fixing']

# 토픽 모델링 단어에 대해 나이브베이지안 확률값 매핑

ps_comment_dic = {}

for topic in ps_comment: