資料:ptt e-shopping¶

# Load the raw PTT dump and cut it into one string per post.
f1="D:/python/data/pchome.txt"
with open(f1,"r",encoding="utf8") as file:
    s1=file.read()
# Reply headers ("Re: ...") are first rewritten to the plain header, then every
# header becomes a separator token the text can be split on.
s1=s1.replace("看板NTUcourse標題Re: [評價]","看板NTUcourse標題[評價]").replace("看板NTUcourse標題[評價]","這個是分隔詞")
# Keep only the non-empty pieces produced by the split.
textlist=[piece for piece in s1.split("這個是分隔詞") if piece!=""]

import jieba as jb
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
class wordvec:
    """Term-frequency / TF-IDF matrix over jieba-segmented documents.

    The constructor segments the documents with jieba, counts tokens with
    scikit-learn's ``CountVectorizer`` and stores the dense document-by-word
    count matrix.  Two boolean column masks, one per filter method, decide
    which words :meth:`get_mat` returns.

    Attributes:
        words: 1-D array of vocabulary tokens, aligned with the matrix columns.
        farray: dense count matrix, one row per document.
        ntf1: column mask set by :meth:`filter_ratio` (all True initially).
        ntf2: column mask set by :meth:`filter_stopword` (all True initially).
        tfidf_mat: matrix produced by :meth:`tfidf`; ``None`` until computed.
    """

    # Marker inserted between documents so the whole corpus can be segmented
    # by jieba in a single call and then cut back into documents.
    _DOC_SEPARATOR = "這個是分隔詞窩"

    def __init__(self, textlist, addlist=None):
        """Segment *textlist* and build the count matrix.

        Args:
            textlist: iterable of document strings.
            addlist: optional extra words to register with jieba before
                segmentation.  The list is not modified.
        """
        # Work on a copy: appending the separator to the argument itself would
        # mutate the caller's list (or a default shared between calls).
        custom_words = list(addlist) if addlist is not None else []
        custom_words.append(self._DOC_SEPARATOR)
        for word in custom_words:
            jb.add_word(word)
        vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False,
                                     token_pattern=r'\b\w+\b', min_df=1)
        # Tokens are comma-joined so the vectorizer's token pattern sees each
        # jieba token as a separate word; the separator restores the documents.
        tokens = jb.lcut(self._DOC_SEPARATOR.join(textlist))
        documents = ",".join(tokens).split(self._DOC_SEPARATOR)
        tf_mat = vectorizer.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()
        self.words = np.array(names, dtype=str)
        self.farray = np.array(tf_mat.toarray())
        self.ntf1 = np.ones(len(self.words), dtype=bool)
        self.ntf2 = np.ones(len(self.words), dtype=bool)
        self.tfidf_mat = None

    def _doc_freq(self):
        """Return, per word, the number of documents with a positive count."""
        return (np.asarray(self.farray) > 0).sum(axis=0)

    def filter_ratio(self, from_a=0, to_b=0):
        """Keep words whose document ratio r satisfies ``from_a < r <= to_b``.

        The ratio is the number of documents containing the word divided by
        the number of documents.

        Returns:
            The new boolean column mask (also stored in ``ntf1``), or ``None``
            when ``from_a >= to_b``, in which case ``ntf1`` is left unchanged.
        """
        if from_a >= to_b:
            return None
        doc_ratio = self._doc_freq() / np.asarray(self.farray).shape[0]
        self.ntf1 = (from_a < doc_ratio) & (doc_ratio <= to_b)
        return self.ntf1

    def filter_stopword(self, stopword=None, num=True, eng=False):
        """Build the stop-word column mask and store it in ``ntf2``.

        Args:
            stopword: words to drop (``None`` or empty drops nothing).
            num: when true, drop words whose first or last character is an
                ASCII digit.
            eng: when true, drop words whose first character is an ASCII
                letter.

        Returns:
            Boolean array, True for every word that is kept.
        """
        stop_set = set(stopword) if stopword is not None else set()

        def keep(word):
            if word in stop_set:
                return False
            first, last = word[0], word[-1]
            if num and ("0" <= first <= "9" or "0" <= last <= "9"):
                return False
            if eng and ("a" <= first <= "z" or "A" <= first <= "Z"):
                return False
            return True

        # Single pass over the vocabulary instead of one pass per criterion.
        mask = np.array([keep(word) for word in self.words], dtype=bool)
        self.ntf2 = mask
        return mask

    def tfidf(self):
        """Compute ``count * log(n_documents / document_frequency)``.

        The result is stored in ``tfidf_mat`` and returned.
        """
        counts = np.asarray(self.farray)
        tfidf_mat = counts * np.log(counts.shape[0] / self._doc_freq())
        self.tfidf_mat = tfidf_mat
        return tfidf_mat

    def get_mat(self, tfidf=True, stopword=True, ratio=True, typ="mat"):
        """Return the filtered matrix or the filtered vocabulary.

        Args:
            tfidf: use the TF-IDF matrix (computed on demand if needed)
                instead of raw counts.
            stopword: apply the mask from :meth:`filter_stopword`.
            ratio: apply the mask from :meth:`filter_ratio`.
            typ: ``"mat"`` for the matrix columns, ``"words"`` for the words.

        Returns:
            The selected columns or words; ``None`` for any other *typ*.
        """
        mask = np.ones(len(self.words), dtype=bool)
        if stopword:
            mask &= self.ntf2
        if ratio:
            mask &= self.ntf1
        if typ == "words":
            return self.words[mask]
        if typ == "mat":
            if tfidf:
                source = getattr(self, "tfidf_mat", None)
                if source is None:
                    # tfidf() has not been called yet: compute it now rather
                    # than failing on a missing attribute.
                    source = self.tfidf()
            else:
                source = self.farray
            return source[:, mask]
        return None

將詞頻矩陣¶

class filter_yaxis:
    """Select rows (documents) of a matrix by the presence of keywords.

    Attributes:
        word: array of words aligned with the columns of ``m``.
        m: 2-D matrix, one row per document and one column per word.
    """

    def __init__(self, word, m):
        self.word = word
        self.m = m

    def filter_byword(self, l1, need=True):
        """Return the rows that contain (or lack) any word in *l1*.

        Args:
            l1: keywords to look for.  A keyword missing from ``word`` is
                skipped; the remaining keywords are still applied.
            need: when true, return rows where at least one keyword has a
                non-zero entry; otherwise return the other rows.

        Returns:
            The selected rows of ``m``.
        """
        hit = np.zeros(self.m.shape[0], dtype=bool)
        for keyword in l1:
            columns = np.flatnonzero(np.asarray(self.word) == keyword)
            if columns.size == 0:
                # Unknown keyword: skip it, but keep processing the rest
                # (stopping here would silently ignore later keywords).
                continue
            hit |= self.m[:, columns[0]] != 0
        return self.m[hit] if need else self.m[~hit]

from sklearn.decomposition import PCA
def PCA_Easy(mat,to_ncomponent=2,by_y=False):
    """Reduce *mat* with PCA.

    Args:
        mat: 2-D matrix to reduce.
        to_ncomponent: number of principal components to keep.
        by_y: when false, ``fit_transform`` is applied to *mat* directly.
            When true, it is applied to the transpose and the result is
            transposed back, so the row axis is the one that is reduced.

    Returns:
        The reduced matrix.
    """
    pca = PCA(n_components=to_ncomponent)
    # A single if/else so the result is always bound, whatever by_y is.
    if by_y:
        return np.transpose(pca.fit_transform(np.transpose(mat)))
    return pca.fit_transform(mat)

算法講解:為了得出兩家購物網站的評價差別，因此使用有別於 KMeans+PCA 等非監督算法的方法¶

class twomat_oneword:
    """Compare word usage between two document-by-word matrices.

    The inputs are two matrices (for example one per shopping site) that
    share the same columns, plus the array of words labelling those columns.
    """

    def __init__(self, m1, m2, word):
        self.m1 = m1
        self.m2 = m2
        self.word = word
        # Only reports the mismatch; construction still succeeds.
        if m1.shape[1] != m2.shape[1] or m1.shape[1] != word.shape[0]:
            print("error:size")

    def avgm(self, m):
        """Scale each row of *m* by the square of its row sum.

        A small constant (1e-9) is added to the denominator so all-zero rows
        do not divide by zero.

        NOTE(review): the original comment described dividing by the row sum,
        but the code has always divided by its square; the behaviour is kept
        unchanged here. Confirm which was intended.
        """
        row_weight = np.sum(m, 1, keepdims=True) ** 2
        return m / (row_weight + 0.000000001)

    def _column_share(self, m):
        """Return the per-word share of *m* after row scaling (sums to 1).

        An empty or all-zero matrix yields zeros instead of 0/0 = NaN, so the
        comparison still ranks words by the other matrix.
        """
        per_word = np.sum(self.avgm(m), 0)
        total = np.sum(per_word)
        if total == 0:
            return np.zeros_like(per_word, dtype=float)
        return per_word / total

    def special_word(self, re="wmat"):
        """Score how strongly each word leans towards ``m1`` versus ``m2``.

        Each matrix is row-scaled (so a single document cannot dominate),
        collapsed to one value per word and normalised to sum to 1 (so the
        two matrices are comparable even with different document counts).
        The score is the ``m1`` share minus the ``m2`` share.

        Args:
            re: ``"nmat"`` returns the score array; ``"wmat"`` returns the
                words sorted from most ``m1``-leaning to most ``m2``-leaning.

        Returns:
            The scores or the sorted words; ``None`` (after printing
            ``"input error"``) for any other value of *re*.
        """
        jg = self._column_share(self.m1) - self._column_share(self.m2)
        if re == "nmat":
            return jg
        elif re == "wmat":
            return self.word[np.flip(np.argsort(jg), 0)]
        else:
            print("input error")
            return None

from wordcloud import WordCloud

w=wordvec(textlist,["紮實",'紮實甜','又甜又涼',"因人而異","看個人","爛貨","申訴"]) # custom words registered with jieba before segmentation

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\gordon\AppData\Local\Temp\jieba.cache
Loading model cost 0.906 seconds.
Prefix dict has been built succesfully.

w.filter_ratio(0.1,0.5)  # keep words found in more than 10% and at most 50% of the posts
w.filter_stopword(stopword=["一人","以下",'一份', '一半', 
                            '一堂', '一學期', '一本',],eng=True,num=True) # drop stop words, words starting with an ASCII letter, and words starting or ending with a digit
w.tfidf()   # build the TF-IDF matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 5.95277285, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

len(w.get_mat(typ="words") ) # number of words passing both filters; the author's idea: such words can act as shared indicators across posts (not in every post, yet not limited to one or two posts)

271

# For each 2%-wide document-ratio bin between 0% and 40%, record how many
# words fall in that bin.
plotlist=[]
for i in np.arange(0,0.4,0.02):
    w.filter_ratio(i,i+0.02) 
    # The square root of the word count is stored, not the count itself.
    sl=(len(w.get_mat(typ="words")))**0.5
    plotlist.append(sl)

這張圖顯示大部分的詞都只出現在很少的文章中¶

x 軸是某詞出現的文章比例(%)，Y 軸是落在該比例區間的詞數的平方根¶

import matplotlib.pyplot as plt 
# x: lower edge of each document-ratio bin, in percent; height: the values
# collected in plotlist.
plt.bar(list(np.arange(0,0.4,0.02)*100) ,plotlist) 
plt.show()

w.filter_stopword(stopword=["一人","以下",'一份', '一半', 
                            '一堂', '一學期', '一本',],eng=True,num=True) # drop stop words, words starting with an ASCII letter, and words starting or ending with a digit
w.tfidf()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 5.95277285, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

比較兩種篩選法一種篩出主題一種篩出相關評價¶

w.filter_ratio(0,0.1)  # keep words found in at most 10% of the posts
# Row 13 is used as a single sample post; here it is scored with TF-IDF.
testm=w.get_mat(tfidf=True,typ="mat")[13]
testword=w.get_mat(typ="words")

# Words of the sample post in descending score order.
# NOTE(review): the slice [1:50] skips the top-ranked word (index 0) - confirm this is intended.
print(testword[np.flip(np.argsort(testm),0)][1:50])
w.filter_ratio(0.08,0.3)  # keep words found in more than 8% and at most 30% of the posts
# Same sample post, this time scored with raw counts.
testm=w.get_mat(tfidf=False,typ="mat")[13]
testword=w.get_mat(typ="words")
print(testword[np.flip(np.argsort(testm),0)][1:50])

['特價' '影響' '金' '銀行' '補足' '想退' '方案' '繁雜' '要購' '之後買' '用限時' '發文如' '貨換' '退貨換'
 '有觸' '金購物' '犯板規' '週末限時' '價錢會' '為限時' '那用' '瑕疵' '原來' '自訂' '看上' '剩餘' '碰'
 '申退' '號購' '新機' '額度' '金活動' '價格' '以上' '此版' '如' '售價' '限量' '還請' '我想' '玉山'
 '解惑' '噢' '上述' '原價' '正確' '退掉' '等等' '教']
['刷卡' '各位' '購買' '之' '第一次' '申請' '原來' '價格' '以上' '退' '目前' '使用' '遇到' '元' '啦'
 '其他' '哦' '什麼' '送' '然' '一台' '版友' '不能' '新' '手機' '只是' '確認' '不到' '麻煩' '發現'
 '叫' '和' '回' '廠商' '囉' '單' '喔' '商店' '呢' '只要' '告知' '名稱' '建議' '可能' '得' '回應'
 '只有' '很多' '可是']

# Widen the document-ratio window to (0, 1]; get_mat still applies the
# stop-word mask by default.
w.filter_ratio(0,1)
testm=w.get_mat(tfidf=False,typ="mat")
testword=w.get_mat(typ="words")

另一種篩選方式關鍵詞篩選¶

# Find the words most related to the keywords
l2=["優惠","活動"]
testm.shape
# Sanity check for missing values. Comparing a numeric array with None is
# always False element-wise, so NaN is tested explicitly instead.
np.argwhere(np.isnan(testm))

array([], shape=(0, 2), dtype=int64)

w1=filter_yaxis(testword,testm)
wtempP=w1.filter_byword(l2)   # posts containing at least one word from l2
wtempN=w1.filter_byword(l2,need=False)   # all remaining posts
# Sanity check for missing values. Comparing a numeric array with None is
# always False element-wise, so NaN is tested explicitly instead.
np.argwhere(np.isnan(wtempN))

array([], shape=(0, 2), dtype=int64)

m1=wtempP
m2=wtempN
w3=twomat_oneword(m1,m2,testword)  # cross-compare posts that contain the keywords with posts that do not
l3=w3.special_word(re="wmat")[0:222]# the 222 words leaning most towards the keyword posts
print(l3)

['活動' '買' '抽' '送' '分享' '儲值' '了' '情報' '好像' '網址' '竟然' '優惠' '特價' '文章' '在' '看'
 '的' '最近' '購買' '刷卡' '有人' '就' '一下' '滿' '金' '便宜' '經驗' '搶' '功能' '時間' '元現'
 '什麼' '金積點' '欸' '啊' '第一次' '去' '看到' '注意' '現在' '更' '結果' '之' '現金積點' '自' '這個'
 '感謝' '周邊' '再' '約' '來' '回函' '枕' '四千多' '提起' '購退貨' '立馬辦' '無故' '下架' '結束' '千多'
 '小' '玉山' '美' '可以' '掃地' '機器' '還有' '變成' '寶貝' '手機' '隻' '嘍' '會自' '報給' '終於體'
 '一覺' '好康道' '有興趣' '幾篇' '威猛' '同樣' '原本' '睡' '相' '不妥' '剛好' '送到' '消費' '又' '囉'
 '名' '進行' '網頁' '日' '明' '時候' '萬購' '物金' '這些' '查看' '前' '鍵盤' '包含' '麻煩' '滑鼠'
 '額' '訊息' '雙人' '滑雪' '套票' '似乎' '不限金額' '價格' '閒聊' '快' '翻購' '多相' '手氣' '優' '呦'
 '買紀' '週年' '關經驗' '剛逛' '粉絲頁' '還去' '限時' '惠' '日本' '快去' '到' '觀察' '不錯' '任何'
 '不到' '其他' '版上' '藍罐' '康是' '一送' '多' '雙' '老公' '告知' '之前' '飆速' '消費滿' '如題' '完'
 '限量' '超' '雖然' '二' '時' '起' '按摩' '蠻' '註' '朋友' '還在' '試試' '沒' '隔天' '啥' '錄'
 '安全性' '驟驗證' '先儲值' '簡訊驗證' '開啟' '關掉' '將手' '金時' '已關' '剛嘗試' '但剛' '閉此' '須經'
 '兩步' '跟上' '標題' '實業坊' '發信' '批' '踢踢' '企業' '近日' '連' '某些' '問是' '五萬' '有名'
 '需登錄' '頂多' '沒算' '先打' '機驗證' '萬' '限定' '台新' '版友' '須' '可行' '步驟' '並且' '一萬' '站'
 '共' '感恩' '驗證' '還沒' '關閉' '單筆' '衛生' '購物' '台灣' '大概' '優惠券' '促銷' '本身' '馬']

import matplotlib.pyplot as plt
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences in a normal literal. Any invisible direction mark pasted in front
# of the drive letter has also been removed.
font = r'C:\Windows\Fonts\mingliu.ttc'
# The word cloud needs a font with CJK glyphs, hence the explicit font_path.
wc = WordCloud(background_color="white" , collocations=False, font_path=font, width=1500, height=860, margin=2).generate(" ".join(l3[0:222]))

plt.imshow(wc)
plt.axis("off")

plt.savefig("D:/python/data/get.png")

# Rank the posts by how strongly their words lean towards the keyword group.
# avgm is a method of twomat_oneword and jg is the score array returned by
# special_word(re="nmat"); neither exists as a module-level name, so both are
# taken from w3.
jg=w3.special_word(re="nmat")
m=w3.avgm(testm)
# The odd power keeps the sign of each score while emphasising large ones.
jgmat=m*((jg*1000)**5)
jgmat=np.sum(jgmat,1)
# Post indices in ascending score order.
np.argsort(jgmat)

array([139,   3,  86,  24, 151, 152, 138, 113, 172, 148, 174,  49,  26,
       125,  31, 109, 124, 157,  84,   4, 137,  94,  63,  78,   6,  13,
        58, 165,   2, 119,  72, 176,  75, 167,  99,  88,   5, 146,  61,
       163,  76,  64,  85, 171,  54, 143,  97,  82,  41,  83,  95, 161,
        74, 126,  34, 122,  48, 154, 105, 132,  87,  46,  28,  81,  59,
        12,  42,   1,  30, 133,  98, 159, 169,  25,  52,  23,   7,  44,
        77, 141, 134, 123, 121,  21,  80,  57,  68,  17,  10, 115, 144,
        62,  55, 173,  14,  50, 147, 164, 162,   0, 114, 117,  45, 168,
       112,  15,  51,   8,  19, 170, 155,  20,  79,  32,  96,  93, 149,
        56, 118,  43,  67, 150,  37,  18,  38,  66, 156, 128, 158, 130,
       100,  35,  90, 177, 108,  53, 110, 107, 160, 116,  40, 106, 135,
        91,  47, 131,  16,  39, 102, 104, 103, 120, 129,  27,  29,  71,
       101,   9,  92,  65, 142,  22, 127,  73,  33,  36, 166,  60,  89,
       136,  70,  11, 111, 153, 145,  69, 140, 175], dtype=int64)

比較兩購物網站，在不同方面的評價¶

# Load the posts of both shopping sites.
# NOTE(review): no header replacement is done here; the split assumes both
# files already contain the separator token - confirm.
f1="D:/python/data/pchome.txt"
with open(f1,"r",encoding="utf8") as file:
    s1=file.read()
pchomelist=[post for post in s1.split("這個是分隔詞") if post!=""]
# Number of PChome posts = index of the first Yahoo row in the joint matrix.
sep_index=len(pchomelist)
f1="D:/python/data/yahoo01.txt"
with open(f1,"r",encoding="utf8") as file:
    s1=file.read()
# Empty pieces are dropped from the Yahoo list too (the earlier loop tested
# pchomelist by mistake, so they were never removed).
yahoolist=[post for post in s1.split("這個是分隔詞") if post!=""]
totallist=pchomelist+yahoolist

# Joint vocabulary over both sites; the list holds custom words registered with jieba.
vs=wordvec(totallist,["糾紛",'運費','退貨',"划算","看個人","爛貨","申訴",'折價卷','抽獎'])

vs.filter_ratio(0.01,0.7)  # keep words found in more than 1% and at most 70% of the posts
# No stop-word list; only words starting or ending with a digit are dropped.
vs.filter_stopword(stopword=[],eng=False,num=True)
vs.tfidf()
mat=vs.get_mat(tfidf=True,typ="mat")
word=vs.get_mat(tfidf=True,typ="words")

print(mat.shape)
print(sep_index)
# totallist is pchomelist followed by yahoolist, so the first sep_index rows
# belong to PChome and the remaining rows to Yahoo.
mpchome=mat[:sep_index,:]
myahoo=mat[sep_index:,:]
mpchome.shape

(544, 2232)
267

(267, 2232)

# Candidate keyword groups, one per topic; listf selects the group used below.
l1=["手機","電腦"]
l2=["折價","折","優惠","活動"]
l3=["退貨","申訴"]
l4=["系統"]
l5=["騙"]
l6=["貴","超貴"]

listf=l3

pc=filter_yaxis(word,mpchome)
ya=filter_yaxis(word,myahoo)

# Compare only the posts of each site that contain a keyword from listf.
w4=twomat_oneword(pc.filter_byword(listf),ya.filter_byword(listf),word)
# Alternative: compare all posts of both sites, without keyword filtering.
#w4=twomat_oneword(mpchome,myahoo,word)
print(pc.filter_byword(listf).shape,ya.filter_byword(listf).shape)
# Words sorted from most PChome-leaning to most Yahoo-leaning.
compareword=w4.special_word(re="wmat")
print(compareword[0:22])
print(np.flip(compareword,0)[0:22])

(117, 2232) (61, 2232)
['pchome' 'PCHOME' '退款' 'ibon' 'Mar' '設定' '原廠' '儲值' '一台' '贈品' '請問' '筆電'
 '費用' '成功' '個' '金' '廠商' '收回' '付款' '同意' '經驗' '逾']
['中心' 'yahoo' '刷' 'YAHOO' '一支' '退掉' '購物' '外盒' '換貨' '其' '圖' 'Yahoo' '按鈕'
 '鞋子' '尺寸' 'ppt' '拍賣' 'From' '小心' '弄' '商城' '資格']

pchome :退貨相關¶

# Raw string: the Windows path contains backslashes that are not valid escape
# sequences in a normal literal. Any invisible direction mark pasted in front
# of the drive letter has also been removed.
font = r'C:\Windows\Fonts\mingliu.ttc'
# Word cloud of the 40 most PChome-leaning words.
wc = WordCloud(background_color="white" , collocations=False, font_path=font, width=1500, height=860, margin=2).generate(" ".join(compareword[0:40]))
plt.imshow(wc)
plt.axis("off")
plt.show()

yahoo :退貨相關¶

# Word cloud of the 40 most Yahoo-leaning words (the reversed ranking).
wc = WordCloud(background_color="white" , collocations=False, font_path=font, width=1500, height=860, margin=2).generate(" ".join(np.flip(compareword,0)[0:40]))

plt.imshow(wc)
plt.axis("off")
plt.show()

# Inspect the type of the object returned by generate().
type(wc)

wordcloud.wordcloud.WordCloud

vs.filter_ratio(0.01,0.5)
testm=vs.get_mat(tfidf=False,typ="mat")
print(testm.shape)
testword=vs.get_mat(tfidf=False,typ="words")
# by_y=True runs PCA on the transposed matrix, so the post axis is reduced:
# the result has 3 rows and one column per word.
mattemp=PCA_Easy(testm,3,by_y=True)
mattemp

(544, 2218)

array([[-3.25202283, -2.9889831 , -3.4812188 , ..., -1.81921407,
        -2.96838579, -2.96520205],
       [ 0.05879249,  0.06492966,  0.05671331, ...,  0.50071135,
         0.20508892, -0.04714739],
       [ 0.39646059,  0.61830549,  0.55278282, ...,  1.63718818,
         0.92539982,  0.28038131]])

嘗試使用 PCA + KMeans 分群¶

from sklearn.cluster import KMeans
clf = KMeans(n_clusters=9)
# Transpose so each word (a column of mattemp) becomes one sample to cluster.
clf.fit(np.transpose(mattemp))
#color=clf.fit(mattemp).labels_
# One cluster label per word.
color=np.transpose(clf.labels_)
print(color)

[0 0 0 ... 0 0 0]

# 2-component projection of the words, used for the scatter plot.
reducedata=PCA_Easy(testm,2,by_y=True)
reducedata.shape

(2, 2218)

import matplotlib.pyplot as plt
# One colour per cluster. Iterating over the labels actually present, rather
# than over the colour list, keeps clusters beyond the sixth on the plot.
colors = ['blue', 'purple', 'yellow',"red","green","black","orange","cyan","magenta"]
for i in np.unique(color):
    # reducedata has one row per principal component and one column per word.
    x = reducedata[0][color == i]
    y = reducedata[1][color == i]
    # The modulo guards against more clusters than colours.
    plt.scatter(x, y, c=colors[i % len(colors)])
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title("PCA Scatter Plot")
plt.show()

# Print the words of each cluster; 9 matches n_clusters passed to KMeans.
for i in range(9):
    
    print(testword[color==i])

['APP' 'ATM' 'Android' ... '點進' '點選' '１']
['pekosan' '上' '不是' '人家' '他' '出貨' '去' '取消' '吧' '商品' '因為' '如果' '妳' '寫' '就是'
 '得' '態度' '所以' '才' '標示' '清楚' '為' '看' '看到' '真的' '編輯' '自己' '規則' '說明' '賣'
 '賣場' '這樣']
['你']
['QQ' 'XD' 'com' 'http' 'imgur' 'pchome' 'po' '一個' '一定' '一樣' '一直' '不會'
 '不能' '不要' '之前' '什麼' '今天' '他們' '但是' '個' '元' '再' '原' '又' '只' '只是' '只有' '可能'
 '啦' '喔' '多' '大' '完全' '對' '幫' '平台' '廠商' '怎麼' '想' '應該' '打' '找' '抱怨' '拿到'
 '收到' '東西' '根本' '比' '沒有' '洗衣' '生氣' '用' '直接' '知道' '等' '而且' '耶' '被' '覺得' '話'
 '誰' '請' '讓' '購物' '較' '退' '退貨' '送' '這' '這種' '這麼' '還是' '那' '錢']
['蝦皮']
['PCHOME' 'PO' 'Y' '_' 'i' 'jpg' 'somehow' 'tinabjqs' 'yahoo' 'yakuhime'
 '一' '一下' '一堆' '一次' '一般' '一點' '上面' '下' '下單' '下標' '不到' '不同' '不好' '不想' '不然'
 '不爽' '不用' '不管' '不過' '中' '中心' '事' '事情' '介面' '付款' '代表' '以' '使用' '便宜' '們'
 '做' '像' '元免' '先' '免運' '公司' '其他' '其實' '凹' '出' '分享' '分開' '別' '到底' '前' '包'
 '包包' '包裝' '卻' '原因' '另外' '只能' '叫' '可' '可是' '合理' '呢' '呵呵' '和' '品質' '哦' '哪裡'
 '商店' '喜歡' '單' '單子' '嘛' '回' '回應' '回覆' '圖片' '地方' '垃圾' '塊' '多少' '大家' '太'
 '奇怪' '她' '字' '它' '完' '客人' '家' '寄' '實體' '對方' '小' '小時' '尤其' '就算' '居然' '差'
 '已' '已經' '希望' '幾' '店家' '店面' '廠' '很多' '從' '您' '情況' '想到' '感覺' '懂' '成本' '我們'
 '我覺' '或' '扯' '把' '折' '折扣' '拍' '拍拍' '拿' '按照' '掉' '接受' '推文' '收' '改' '方式'
 '於' '明' '明明' '昨天' '是不是' '時候' '更' '最' '最近' '月' '有些' '有人' '有點' '朋友' '本來'
 '標' '機' '欸' '正常' '每個' '比較' '沒錯' '注意' '活動' '流程' '浪費' '消費者' '滿' '無法' '現在'
 '理由' '理解' '瑕疵' '發現' '發票' '皮' '看清楚' '真' '碼' '符合' '系統' '結帳' '結果' '經驗' '網購'
 '網頁' '罵' '而' '而已' '能' '與' '若' '處理' '蝦' '蠻' '製' '要是' '要求' '規定' '訂單' '設定'
 '註' '評價' '該' '認為' '請問' '謝謝' '貨' '買家' '買過' '貼' '資訊' '賺' '購買' '超商' '退款'
 '這個' '這家' '這是' '這篇' '通知' '遇到' '運' '運費' '運送' '還' '還有' '還要' '那個' '那麼' '重點'
 '錯' '開始' '阿' '限制' '隻' '難' '難道' '電話' '需要' '面' '頁' '騙']
['人' '啊' '噓' '客服' '很' '跟']
['賣家']
['PC' 'pc']

資料:ptt e-shopping¶

將詞頻矩陣¶

算法講解:為了得出兩家購物網站的評價差別，因此使用有別於 KMeans+PCA 等非監督算法的方法¶

這張圖顯示大部分的詞都只出現在很少的文章中¶

x 軸是某詞出現的文章比例(%)，Y 軸是落在該比例區間的詞數的平方根¶

比較兩種篩選法 一種篩出主題 一種篩出相關評價¶

另一種篩選方式 關鍵詞篩選¶

比較兩購物網站，在不同方面的評價¶

pchome :退貨相關¶

yahoo :退貨相關¶

嘗試使用 PCA + KMeans 分群¶

比較兩種篩選法一種篩出主題一種篩出相關評價¶

另一種篩選方式關鍵詞篩選¶