library(rvest)
## Loading required package: xml2
# Google search URL pieces.  webname is the URL-encoded query term —
# here "你的名字" ("Your Name", the Shinkai film).
web1 <- "https://www.google.com.tw/search?q="
webname <- "%E4%BD%A0%E7%9A%84%E5%90%8D%E5%AD%97"
# Alternative queries kept for reference:
# "2018%E5%A4%8F%E7%95%AAptt"            (2018 summer anime, ptt)
# "%E9%AB%94%E8%82%B2%E7%94%9C"          (體育甜)
# '%e7%8e%8b%e5%82%91%e8%b3%a2'          (name to search)
# '%E6%88%91%E7%9A%84%E8%8B%B1%E9%9B%84%E5%AD%B8%E9%99%A2' (My Hero Academia)
# Restrict results to www.ptt.cc; &start= paginates 10 results at a time.
web2 <- "+ptt+site:www.ptt.cc&rlz=1C1GCEA_enTW762TW762&ei=njSjW5PDMIj08gWx8Je4DA&start="
webnum <- ""
webfinal <- "&sa=N&biw=1280&bih=653"
website <- c()  # collected result hrefs, filled by the scraping loop
long <- 3       # number of extra Google result pages (pages 0..long)
# Walk the Google result pages and collect the href of the first child
# of every ".r" result node.
for (i in 0:long) {
  webnum <- as.character(i * 10)  # &start= offset for this page
  web <- paste0(web1, webname, web2, webnum, webfinal)
  doc1 <- read_html(web)
  block <- html_nodes(doc1, ".r")
  text1 <- html_text(block)
  # BUG FIX: the original hard-coded 1:10 and crashed with a subscript
  # error whenever a page returned fewer than 10 results.
  for (j in seq_len(min(10, length(block)))) {
    website[j + i * 10] <- xml_attrs(xml_child(block[[j]], 1))[["href"]]
  }
}
print("down")
## [1] "down"
# Clean the collected hrefs.  A Google redirect href looks like
# "/url?q=<target>&sa=...": keep only the part before the first '&',
# then everything after the first '='.
# BUG FIXES vs the original:
#  * iterate over what was actually collected (seq_along) instead of a
#    hard-coded 1:(10*long+10), which failed when fewer links came back;
#  * the original split on EVERY '=' and kept piece 2, truncating any
#    target URL that itself contains '=' — sub() keeps the full tail.
a <- website
for (i in seq_along(a)) {
  first_part <- strsplit(a[[i]], "&", fixed = TRUE)[[1]][1]
  a[[i]] <- sub("^[^=]*=", "", first_part)
}
website <- as.list(a)
head(website)  # cleaning done
## [[1]]
## [1] "https://www.ptt.cc/bbs/movie/M.1477462873.A.3E6.html"
##
## [[2]]
## [1] "https://www.ptt.cc/bbs/shinkai/M.1508934062.A.A16.html"
##
## [[3]]
## [1] "https://www.ptt.cc/bbs/C_Chat/M.1476690397.A.79D.html"
##
## [[4]]
## [1] "https://www.ptt.cc/bbs/movie/M.1476965126.A.FB5.html"
##
## [[5]]
## [1] "https://www.ptt.cc/bbs/movie/M.1477068122.A.736.html"
##
## [[6]]
## [1] "https://www.ptt.cc/bbs/movie/M.1491597833.A.E5C.html"
# Enter PTT: download each collected article and keep its full text.
# BUG FIXES vs the original:
#  * it looped 1:(long*10) and silently skipped the links scraped from
#    the last Google page — seq_along(website) fetches everything;
#  * list elements are assigned with [[ ]] rather than [ ].
txtlist <- list()
for (i in seq_along(website)) {
  page <- read_html(website[[i]])
  # txt=strsplit(txt,'NTUcourse標題')  # (old per-article splitting, unused)
  txtlist[[i]] <- html_text(page)
}
# NOTE(review): hard-coded Windows working directory — this fails on any
# other machine; consider a relative path or here::here().
setwd('D:/R/R data/TextMining')
library(NLP)          # required by tm
library(tm)           # corpus construction and text transformations
library(jiebaRD)      # dictionaries used by jiebaR
library(jiebaR)       # Chinese word segmentation
library(RColorBrewer) # color palettes for the word cloud
library(wordcloud)
# Build a corpus, one document per downloaded article.
docs <- Corpus(VectorSource(txtlist))

# Transformer that blanks out every match of `pattern` in a document.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Blank out PTT boilerplate markers and all latin letters.
# (tm warns "transformation drops documents" for each pass — expected
# with SimpleCorpus, the content itself is preserved.)
ptt_noise <- c("※", "看板", "作者", "發信站", "批踢踢實業坊", "[a-zA-Z]")
for (noise in ptt_noise) {
  docs <- tm_map(docs, toSpace, noise)
}
# Stop-phrase list: common filler words blanked out before counting.
# (The original listed "可能" and "時候" twice each; duplicates removed —
# gsub is idempotent, so the result is identical with fewer passes.)
pop_list <- c(
  "作品", "視覺", "什麼", "可以", "這部", "因為", "所以", "不過", "其實",
  "應該", "如果", "覺得", "不是", "沒有", "這樣", "還是", "一樣", "不會",
  "只是", "可能", "時候", "知道", "最後", "就是", "真的"
)
# Filter the filler words out of every document.
# (Each pass emits tm's usual "transformation drops documents" warning.)
for (w in pop_list) {
  docs <- tm_map(docs, toSpace, w)
}
# (Could also strip the director's name and the film title here.)
# docs <- tm_map(docs,removeWords,stopwordsCN())  # remove common stop words
# Apply the standard tm clean-up passes in order: punctuation, digits,
# then collapse runs of whitespace.
for (cleaner in list(removePunctuation, removeNumbers, stripWhitespace)) {
  docs <- tm_map(docs, cleaner)
}
# jieba segmentation worker with custom vocabulary.
mixseg <- worker()
# BUG FIX: new_user_word(worker, words, tags) treats its THIRD argument
# as the part-of-speech tag; the original passed "你的名字" as the tag,
# so only "新海誠" was ever added.  Pass both terms as new words.
new_user_word(mixseg, c("新海誠", "你的名字"))

# Tokenize one corpus document with the jieba worker; returns a plain
# character vector of segments.
jieba_tokenizer <- function(d) {
  unlist(segment(d[[1]], mixseg))
}
# Segment every document, pool the tokens, and tabulate frequencies.
seg <- lapply(docs, jieba_tokenizer)
freqFrame <- as.data.frame(table(unlist(seg)))
# Keep only multi-character tokens, then sort by descending frequency
# (rev(order(...)) kept for the exact original tie ordering).
multi_char <- nchar(as.character(freqFrame$Var1)) > 1
freqFrame <- freqFrame[multi_char, ]
freqFrame <- freqFrame[rev(order(freqFrame$Freq)), ]
# freqFrame
library(ggplot2)  # note: masks NLP::annotate

# Bar chart of the most frequent terms (Freq > 121), bars ordered by
# descending frequency.
ggplot(subset(freqFrame, Freq > 121),
       aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_bar(stat = "identity")

# Word cloud over all terms that pass min.freq.
wordcloud(
  freqFrame$Var1, freqFrame$Freq,
  scale = c(4, 0.1), min.freq = 6, max.words = 140,
  random.order = TRUE, random.color = FALSE,
  rot.per = .1, colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE, use.r.layout = FALSE,
  fixed.asp = TRUE
)
# Note that the `echo = FALSE` parameter was added to the code chunk to
# prevent printing of the R code that generated the plot.