library(rvest)
## Loading required package: xml2
# Google search URL pieces.  webname is the URL-encoded query term —
# here "你的名字" ("Your Name", the Shinkai film).
web1 <- "https://www.google.com.tw/search?q="
webname <- "%E4%BD%A0%E7%9A%84%E5%90%8D%E5%AD%97"
# Alternative queries kept for reference:
# "2018%E5%A4%8F%E7%95%AAptt"            (2018 summer anime, ptt)
# "%E9%AB%94%E8%82%B2%E7%94%9C"          (體育甜)
# '%e7%8e%8b%e5%82%91%e8%b3%a2'          (name to search)
# '%E6%88%91%E7%9A%84%E8%8B%B1%E9%9B%84%E5%AD%B8%E9%99%A2' (My Hero Academia)
# Restrict results to www.ptt.cc; &start= paginates 10 results at a time.
web2 <- "+ptt+site:www.ptt.cc&rlz=1C1GCEA_enTW762TW762&ei=njSjW5PDMIj08gWx8Je4DA&start="
webnum <- ""
webfinal <- "&sa=N&biw=1280&bih=653"
website <- c()  # collected result hrefs, filled by the scraping loop
long <- 3       # number of extra Google result pages (pages 0..long)
# Walk the Google result pages and collect the href of the first child
# of every ".r" result node.
for (i in 0:long) {
  webnum <- as.character(i * 10)  # &start= offset for this page
  web <- paste0(web1, webname, web2, webnum, webfinal)
  doc1 <- read_html(web)
  block <- html_nodes(doc1, ".r")
  text1 <- html_text(block)
  # BUG FIX: the original hard-coded 1:10 and crashed with a subscript
  # error whenever a page returned fewer than 10 results.
  for (j in seq_len(min(10, length(block)))) {
    website[j + i * 10] <- xml_attrs(xml_child(block[[j]], 1))[["href"]]
  }
}
print("down")
## [1] "down"
# Clean the collected hrefs.  A Google redirect href looks like
# "/url?q=<target>&sa=...": keep only the part before the first '&',
# then everything after the first '='.
# BUG FIXES vs the original:
#  * iterate over what was actually collected (seq_along) instead of a
#    hard-coded 1:(10*long+10), which failed when fewer links came back;
#  * the original split on EVERY '=' and kept piece 2, truncating any
#    target URL that itself contains '=' — sub() keeps the full tail.
a <- website
for (i in seq_along(a)) {
  first_part <- strsplit(a[[i]], "&", fixed = TRUE)[[1]][1]
  a[[i]] <- sub("^[^=]*=", "", first_part)
}
website <- as.list(a)
head(website)  # cleaning done
## [[1]]
## [1] "https://www.ptt.cc/bbs/movie/M.1477462873.A.3E6.html"
##
## [[2]]
## [1] "https://www.ptt.cc/bbs/shinkai/M.1508934062.A.A16.html"
##
## [[3]]
## [1] "https://www.ptt.cc/bbs/C_Chat/M.1476690397.A.79D.html"
##
## [[4]]
## [1] "https://www.ptt.cc/bbs/movie/M.1476965126.A.FB5.html"
##
## [[5]]
## [1] "https://www.ptt.cc/bbs/movie/M.1477068122.A.736.html"
##
## [[6]]
## [1] "https://www.ptt.cc/bbs/movie/M.1491597833.A.E5C.html"
# Enter PTT: download each collected article and keep its full text.
# BUG FIXES vs the original:
#  * it looped 1:(long*10) and silently skipped the links scraped from
#    the last Google page — seq_along(website) fetches everything;
#  * list elements are assigned with [[ ]] rather than [ ].
txtlist <- list()
for (i in seq_along(website)) {
  page <- read_html(website[[i]])
  # txt=strsplit(txt,'NTUcourse標題')  # (old per-article splitting, unused)
  txtlist[[i]] <- html_text(page)
}
# NOTE(review): hard-coded Windows working directory — this fails on any
# other machine; consider a relative path or here::here().
setwd('D:/R/R data/TextMining')
library(NLP)          # required by tm
library(tm)           # corpus construction and text transformations
library(jiebaRD)      # dictionaries used by jiebaR
library(jiebaR)       # Chinese word segmentation
library(RColorBrewer) # color palettes for the word cloud
library(wordcloud)
# Build a corpus, one document per downloaded article.
docs <- Corpus(VectorSource(txtlist))

# Transformer that blanks out every match of `pattern` in a document.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Blank out PTT boilerplate markers and all latin letters.
# (tm warns "transformation drops documents" for each pass — expected
# with SimpleCorpus, the content itself is preserved.)
ptt_noise <- c("※", "看板", "作者", "發信站", "批踢踢實業坊", "[a-zA-Z]")
for (noise in ptt_noise) {
  docs <- tm_map(docs, toSpace, noise)
}
# Stop-phrase list: common filler words blanked out before counting.
# (The original listed "可能" and "時候" twice each; duplicates removed —
# gsub is idempotent, so the result is identical with fewer passes.)
pop_list <- c(
  "作品", "視覺", "什麼", "可以", "這部", "因為", "所以", "不過", "其實",
  "應該", "如果", "覺得", "不是", "沒有", "這樣", "還是", "一樣", "不會",
  "只是", "可能", "時候", "知道", "最後", "就是", "真的"
)
# Filter the filler words out of every document.
# (Each pass emits tm's usual "transformation drops documents" warning.)
for (w in pop_list) {
  docs <- tm_map(docs, toSpace, w)
}
# (Could also strip the director's name and the film title here.)
# docs <- tm_map(docs,removeWords,stopwordsCN())  # remove common stop words
# Apply the standard tm clean-up passes in order: punctuation, digits,
# then collapse runs of whitespace.
for (cleaner in list(removePunctuation, removeNumbers, stripWhitespace)) {
  docs <- tm_map(docs, cleaner)
}
# jieba segmentation worker with custom vocabulary.
mixseg <- worker()
# BUG FIX: new_user_word(worker, words, tags) treats its THIRD argument
# as the part-of-speech tag; the original passed "你的名字" as the tag,
# so only "新海誠" was ever added.  Pass both terms as new words.
new_user_word(mixseg, c("新海誠", "你的名字"))

# Tokenize one corpus document with the jieba worker; returns a plain
# character vector of segments.
jieba_tokenizer <- function(d) {
  unlist(segment(d[[1]], mixseg))
}
# Segment every document, pool the tokens, and tabulate frequencies.
seg <- lapply(docs, jieba_tokenizer)
freqFrame <- as.data.frame(table(unlist(seg)))
# Keep only multi-character tokens, then sort by descending frequency
# (rev(order(...)) kept for the exact original tie ordering).
multi_char <- nchar(as.character(freqFrame$Var1)) > 1
freqFrame <- freqFrame[multi_char, ]
freqFrame <- freqFrame[rev(order(freqFrame$Freq)), ]
# freqFrame
library(ggplot2)  # note: masks NLP::annotate

# Bar chart of the most frequent terms (Freq > 121), bars ordered by
# descending frequency.
ggplot(subset(freqFrame, Freq > 121),
       aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_bar(stat = "identity")

# Word cloud over all terms that pass min.freq.
wordcloud(
  freqFrame$Var1, freqFrame$Freq,
  scale = c(4, 0.1), min.freq = 6, max.words = 140,
  random.order = TRUE, random.color = FALSE,
  rot.per = .1, colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE, use.r.layout = FALSE,
  fixed.asp = TRUE
)
# Note that the `echo = FALSE` parameter was added to the code chunk to
# prevent printing of the R code that generated the plot.