# Load the raw corpus and split it into one string per article.
f1 = "D:/python/data/pchome.txt"
with open(f1, "r", encoding="utf8") as file:
    s1 = file.read()
# Fold reply headers into the plain review header, then turn every header
# into a single separator token that the text can be split on.
s1 = s1.replace("看板NTUcourse標題Re: [評價]", "看板NTUcourse標題[評價]")
s1 = s1.replace("看板NTUcourse標題[評價]", "這個是分隔詞")
# Drop empty chunks in one pass (the repeated list.remove('') loop was
# quadratic in the number of chunks).
textlist = [chunk for chunk in s1.split("這個是分隔詞") if chunk != ""]
import jieba as jb
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
class wordvec:
    """Term-count model of a list of documents segmented with jieba.

    Attributes set by ``__init__``:
        words:  1-D array of vocabulary terms, one per matrix column.
        farray: 2-D count matrix, one row per document, one column per term.
        ntf1:   boolean column mask last produced by ``filter_ratio``.
        ntf2:   boolean column mask last produced by ``filter_stopword``.

    ``tfidf_mat`` is created by ``tfidf`` (or lazily by ``get_mat``).
    """

    def __init__(self, textlist, addlist=None):
        """Segment ``textlist`` and build the vocabulary and count matrix.

        Args:
            textlist: iterable of document strings.
            addlist: optional extra terms to register with jieba. The list is
                copied, so neither the caller's list nor a shared default is
                mutated.
        """
        separator = "這個是分隔詞窩"
        custom_terms = list(addlist) if addlist is not None else []
        # The separator is registered as a word so jieba keeps it in one piece.
        custom_terms.append(separator)
        for term in custom_terms:
            jb.add_word(term)
        vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False,
                                     token_pattern=r'\b\w+\b', min_df=1)
        # Segment all documents in one call, then split the comma-joined
        # tokens back into one string per document on the separator.
        jbcut = jb.lcut(separator.join(textlist))
        jbcut = ",".join(jbcut).split(separator)
        tf_mat = vectorizer.fit_transform(jbcut)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for old installations.
        if hasattr(vectorizer, "get_feature_names_out"):
            vocabulary = vectorizer.get_feature_names_out()
        else:
            vocabulary = vectorizer.get_feature_names()
        self.words = np.array(list(vocabulary))
        self.farray = np.array(tf_mat.toarray())
        self.ntf1 = np.ones(len(self.words), dtype=bool)
        self.ntf2 = np.ones(len(self.words), dtype=bool)

    def filter_ratio(self, from_a=0, to_b=0):
        """Keep terms whose document ratio r satisfies ``from_a < r <= to_b``.

        The ratio is the number of rows with a positive count for the term
        divided by the number of rows. The mask is stored in ``self.ntf1``.

        Returns:
            The boolean mask, or None (mask left unchanged) when
            ``from_a >= to_b``.
        """
        if from_a >= to_b:
            return None
        docs_with_term = np.sum(self.farray > 0, axis=0)
        doc_ratio = docs_with_term / self.farray.shape[0]
        self.ntf1 = (from_a < doc_ratio) & (doc_ratio <= to_b)
        return self.ntf1

    def filter_stopword(self, stopword=(), num=True, eng=False):
        """Build the stop-word mask and store it in ``self.ntf2``.

        Args:
            stopword: terms to drop.
            num: when true, drop terms whose first or last character is an
                ASCII digit.
            eng: when true, drop terms whose first character is an ASCII
                letter.

        Returns:
            Boolean mask over ``self.words`` (True means keep).
        """
        keep = np.ones(len(self.words), dtype=bool)
        if len(stopword) != 0:
            stop_set = set(stopword)
            keep &= np.array([term not in stop_set for term in self.words],
                             dtype=bool)
        if num:
            keep &= np.array(
                [not ("0" <= term[0] <= "9" or "0" <= term[-1] <= "9")
                 for term in self.words],
                dtype=bool)
        if eng:
            keep &= np.array(
                [not ("a" <= term[0] <= "z" or "A" <= term[0] <= "Z")
                 for term in self.words],
                dtype=bool)
        self.ntf2 = keep
        return keep

    def tfidf(self):
        """Compute ``count * log(n_docs / docs_with_term)`` for every cell.

        The result is stored in ``self.tfidf_mat`` and returned.
        """
        docs_with_term = np.sum(self.farray > 0, axis=0)
        tfidf_mat = self.farray * np.log(self.farray.shape[0] / docs_with_term)
        self.tfidf_mat = tfidf_mat
        return tfidf_mat

    def get_mat(self, tfidf=True, stopword=True, ratio=True, typ="mat"):
        """Return the filtered matrix or the filtered vocabulary.

        Args:
            tfidf: use the tf-idf matrix instead of raw counts.
            stopword: apply the mask from ``filter_stopword``.
            ratio: apply the mask from ``filter_ratio``.
            typ: ``"mat"`` for the matrix, ``"words"`` for the terms.

        Returns:
            The selected columns or terms; None for any other ``typ``.
        """
        keep = np.ones(len(self.words), dtype=bool)
        if stopword:
            keep = keep & self.ntf2
        if ratio:
            keep = keep & self.ntf1
        if typ == "words":
            return self.words[keep]
        if typ == "mat":
            if tfidf:
                # Compute on demand so get_mat works before tfidf() is called.
                if not hasattr(self, "tfidf_mat"):
                    self.tfidf()
                return self.tfidf_mat[:, keep]
            return self.farray[:, keep]
        return None
class filter_yaxis:
    """Select rows (documents) of a matrix by the presence of given terms."""

    def __init__(self, word, m):
        """Store the vocabulary ``word`` and the matrix ``m``.

        ``word[j]`` is looked up to find column ``j`` of ``m``.
        """
        self.word = word
        self.m = m

    def filter_byword(self, l1, need=True):
        """Return the rows that contain (or do not contain) any term of ``l1``.

        A row contains a term when its entry in that term's column is
        non-zero. Terms missing from the vocabulary are skipped; previously
        the first missing term aborted the loop and silently ignored every
        later term.

        Args:
            l1: iterable of terms to look for.
            need: when true return the matching rows, otherwise return the
                rows that match none of the terms.
        """
        has_term = np.zeros(self.m.shape[0], dtype=bool)
        for term in l1:
            columns = np.argwhere(self.word == term)
            if columns.shape[0] == 0:
                continue  # term not in the vocabulary
            column = columns[0][0]
            has_term |= (self.m[:, column] != 0)
        if need:
            return self.m[has_term]
        return self.m[~has_term]
from sklearn.decomposition import PCA
def PCA_Easy(mat, to_ncomponent=2, by_y=False):
    """Reduce ``mat`` with PCA.

    Args:
        mat: 2-D matrix.
        to_ncomponent: number of components to keep.
        by_y: when false, PCA is fitted on ``mat`` as given. When true, PCA is
            fitted on the transpose of ``mat`` and the result is transposed
            back, so the output has ``to_ncomponent`` rows.

    Returns:
        The transformed matrix.
    """
    pca = PCA(n_components=to_ncomponent)
    # A single truthiness branch: the former pair of ``== False`` /
    # ``== True`` tests left the result unbound for a non-boolean flag.
    if by_y:
        return np.transpose(pca.fit_transform(np.transpose(mat)))
    return pca.fit_transform(mat)
#輸入為兩個矩陣,分別代表兩家的文章
class twomat_oneword:
    """Compare term usage between two document matrices sharing one vocabulary.

    ``m1`` and ``m2`` hold one row per document and one column per entry of
    ``word``.
    """

    def __init__(self, m1, m2, word):
        """Store both matrices and the vocabulary.

        Raises:
            ValueError: if the column counts of ``m1`` and ``m2`` differ or do
                not match the length of ``word`` (previously this was only
                printed and the object was left in an unusable state).
        """
        if m1.shape[1] != m2.shape[1] or m1.shape[1] != word.shape[0]:
            raise ValueError(
                "error:size: m1 has %d columns, m2 has %d, word has %d entries"
                % (m1.shape[1], m2.shape[1], word.shape[0]))
        self.m1 = m1
        self.m2 = m2
        self.word = word

    def avgm(self, m):
        """Scale each row of ``m`` by the square of its row sum.

        A small epsilon is added to the divisor to avoid division by zero.
        """
        # NOTE(review): the original comment described this as "divide by the
        # row sum", but the code divides by the row sum squared - confirm
        # which one is intended. Behaviour is kept as is.
        row_scale = (np.sum(m, 1)) ** 2
        return m / (row_scale.reshape(row_scale.shape[0], 1) + 0.000000001)

    def _term_share(self, m):
        """Return the column sums of ``avgm(m)`` normalised to total 1."""
        column_sum = np.sum(self.avgm(m), 0)
        total = np.sum(column_sum)
        if total == 0:
            # Empty or all-zero group: use zeros instead of a 0/0 NaN vector.
            return np.zeros(column_sum.shape, dtype=float)
        return column_sum / total

    def special_word(self, re="wmat"):
        """Score every term by how much more it is used in ``m1`` than ``m2``.

        Each matrix is row-scaled with ``avgm``, summed over documents and
        normalised to total 1 (so a different number of documents in the two
        groups does not skew the comparison); the score is the ``m1`` share
        minus the ``m2`` share.

        Args:
            re: ``"nmat"`` returns the score vector; ``"wmat"`` returns the
                vocabulary ordered from highest to lowest score.

        Returns:
            The requested array, or None (after printing ``input error``) for
            any other ``re``.
        """
        jg = self._term_share(self.m1) - self._term_share(self.m2)
        if re == "nmat":
            return jg
        if re == "wmat":
            return self.word[np.flip(np.argsort(jg), 0)]
        print("input error")
        return None
from wordcloud import WordCloud
# Build the model; the second argument lists custom terms registered with jieba.
w = wordvec(
    textlist,
    ["紮實", '紮實甜', '又甜又涼', "因人而異", "看個人", "爛貨", "申訴"],
)
# Keep words found in more than 10% and at most 50% of the documents.
w.filter_ratio(0.1, 0.5)
# Drop the listed stop words, English-leading terms and digit-edged terms.
w.filter_stopword(
    stopword=["一人", "以下", '一份', '一半', '一堂', '一學期', '一本'],
    eng=True,
    num=True,
)
w.tfidf()  # build the tf-idf matrix
# Words in this band may serve as shared indicators across documents: they do
# not occur in every document, yet are not limited to one or two documents.
# (Notebook-style inspection; the value is discarded when run as a script.)
len(w.get_mat(typ="words"))

# Square root of the vocabulary size in each 2%-wide document-ratio band
# from 0% to 40%.
plotlist = []
for i in np.arange(0, 0.4, 0.02):
    w.filter_ratio(i, i + 0.02)
    sl = (len(w.get_mat(typ="words"))) ** 0.5
    plotlist.append(sl)
import matplotlib.pyplot as plt
plt.bar(list(np.arange(0, 0.4, 0.02) * 100), plotlist)
plt.show()

w.filter_stopword(
    stopword=["一人", "以下", '一份', '一半', '一堂', '一學期', '一本'],
    eng=True,
    num=True,
)
w.tfidf()

# Words found in at most 10% of the documents, ranked by tf-idf in row 13.
w.filter_ratio(0, 0.1)
testm = w.get_mat(tfidf=True, typ="mat")[13]
testword = w.get_mat(typ="words")
print(testword[np.flip(np.argsort(testm), 0)][1:50])

# Words found in 8% to 30% of the documents, ranked by raw count in row 13.
w.filter_ratio(0.08, 0.3)
testm = w.get_mat(tfidf=False, typ="mat")[13]
testword = w.get_mat(typ="words")
print(testword[np.flip(np.argsort(testm), 0)][1:50])
# Ratio filter effectively off: keep every word with a ratio in (0, 1].
w.filter_ratio(0, 1)
testm = w.get_mat(tfidf=False, typ="mat")
testword = w.get_mat(typ="words")
# Find the words most associated with the keywords.
l2 = ["優惠", "活動"]
# Notebook-style inspections; the values are discarded when run as a script.
testm.shape
np.argwhere(testm == None)
w1 = filter_yaxis(testword, testm)
wtempP = w1.filter_byword(l2)  # documents containing any word of l2
wtempN = w1.filter_byword(l2, need=False)  # all other documents
np.argwhere(wtempN == None)
m1 = wtempP
m2 = wtempN
# Cross-compare documents with the keywords against documents without them.
w3 = twomat_oneword(m1, m2, testword)
l3 = w3.special_word(re="wmat")[0:222]  # words most associated with the keywords
print(l3)
import matplotlib.pyplot as plt
# Raw string: the backslashes are literal path separators, not escapes.
font = r'C:\Windows\Fonts\mingliu.ttc'
wc = WordCloud(background_color="white", collocations=False, font_path=font,
               width=1500, height=860, margin=2).generate(" ".join(l3[0:222]))
plt.imshow(wc)
plt.axis("off")
plt.savefig("D:/python/data/get.png")
# Score every document by its row-scaled counts weighted with the per-word
# association score. avgm and the score vector live on the twomat_oneword
# instance; the bare names avgm / jg used here before were undefined.
m = w3.avgm(testm)
jg = w3.special_word(re="nmat")
jgmat = m * ((jg * 1000) ** 5)
jgmat = np.sum(jgmat, 1)
np.argsort(jgmat)
# Load the PChome corpus; unlike the first load, no header replacement is
# applied here and the text is split on the separator directly.
f1 = "D:/python/data/pchome.txt"
with open(f1, "r", encoding="utf8") as file:
    s1 = file.read()
pchomelist = [doc for doc in s1.split("這個是分隔詞") if doc != ""]
sep_index = len(pchomelist)  # row index where the Yahoo documents start

f1 = "D:/python/data/yahoo01.txt"
with open(f1, "r", encoding="utf8") as file:
    s1 = file.read()
# Empty chunks are now removed from yahoolist itself; the old loop tested
# pchomelist (already clean), so it never removed anything.
yahoolist = [doc for doc in s1.split("這個是分隔詞") if doc != ""]
totallist = pchomelist + yahoolist

vs = wordvec(totallist, ["糾紛", '運費', '退貨', "划算", "看個人", "爛貨", "申訴", '折價卷', '抽獎'])
vs.filter_ratio(0.01, 0.7)  # keep words found in 1% to 70% of the documents
vs.filter_stopword(stopword=[], eng=False, num=True)
vs.tfidf()
mat = vs.get_mat(tfidf=True, typ="mat")
word = vs.get_mat(tfidf=True, typ="words")
print(mat.shape)
print(sep_index)
# Rows before sep_index come from pchomelist, the rest from yahoolist.
mpchome = mat[:sep_index, :]
myahoo = mat[sep_index:, :]
mpchome.shape

# Candidate keyword groups; listf selects the one used below.
l1 = ["手機", "電腦"]
l2 = ["折價", "折", "優惠", "活動"]
l3 = ["退貨", "申訴"]
l4 = ["系統"]
l5 = ["騙"]
l6 = ["貴", "超貴"]
listf = l3
pc = filter_yaxis(word, mpchome)
ya = filter_yaxis(word, myahoo)
# Compare only the documents of each site that mention a word of listf.
w4 = twomat_oneword(pc.filter_byword(listf), ya.filter_byword(listf), word)
print(pc.filter_byword(listf).shape, ya.filter_byword(listf).shape)
compareword = w4.special_word(re="wmat")
print(compareword[0:22])  # highest scores: leaning towards the first matrix
print(np.flip(compareword, 0)[0:22])  # lowest scores: leaning towards the second

# Raw string: the backslashes are literal path separators, not escapes.
font = r'C:\Windows\Fonts\mingliu.ttc'
wc = WordCloud(background_color="white", collocations=False, font_path=font,
               width=1500, height=860, margin=2).generate(" ".join(compareword[0:40]))
plt.imshow(wc)
plt.axis("off")
plt.show()
wc = WordCloud(background_color="white", collocations=False, font_path=font,
               width=1500, height=860, margin=2).generate(" ".join(np.flip(compareword, 0)[0:40]))
plt.imshow(wc)
plt.axis("off")
plt.show()
type(wc)
vs.filter_ratio(0.01, 0.5)  # keep words found in 1% to 50% of the documents
testm = vs.get_mat(tfidf=False, typ="mat")
print(testm.shape)
testword = vs.get_mat(tfidf=False, typ="words")
# by_y=True runs PCA on the transposed matrix, so each column of the result
# is one word described by 3 components.
mattemp = PCA_Easy(testm, 3, by_y=True)
mattemp
from sklearn.cluster import KMeans
n_clusters = 9
clf = KMeans(n_clusters=n_clusters)
clf.fit(np.transpose(mattemp))  # one sample per word
color = np.transpose(clf.labels_)  # cluster label of every word
print(color)
reducedata = PCA_Easy(testm, 2, by_y=True)  # 2 components per word for plotting
reducedata.shape
import matplotlib.pyplot as plt
# One colour per cluster. The list used to hold 6 colours for 9 clusters, so
# clusters 6-8 were never drawn.
colors = ['blue', 'purple', 'yellow', "red", "green", "black",
          "orange", "cyan", "magenta"]
type(reducedata)
for i in range(n_clusters):
    x = reducedata[0][color == i]
    y = reducedata[1][color == i]
    plt.scatter(x, y, c=colors[i])
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title("PCA Scatter Plot")
plt.show()
# List the words assigned to each cluster.
for i in range(n_clusters):
    print(testword[color == i])