收集台北市長柯文哲臉書粉專上有關「世大運」一文的網友留言
# NOTE(review): wipes every object, including dot-prefixed hidden ones, from
# the environment this script runs in. It does not detach packages or reset
# options, so it is not a substitute for starting a fresh R session.
rm(list = ls(all.names=TRUE))
library(httr)
library(rjson)
library(httpuv)
library(Rfacebook)
##
## Attaching package: 'Rfacebook'
## The following object is masked from 'package:methods':
##
## getGroup
# Graph API access token. Setting the FB_ACCESS_TOKEN environment variable
# overrides the literal that was originally hard-coded here.
# NOTE(review): a credential committed to source should be revoked and the
# fallback literal removed.
token <- Sys.getenv(
  "FB_ACCESS_TOKEN",
  unset = "EAACEdEose0cBALSvdT5pTeZC1QekXkBNSDA3rPXoIgdH6qyJEA00MogctjElzNRZBsRjfrC09eaw6W6ZBhZB8ZAD5YluKzOyUfBCpXYeKaGIwjXVVSJO94Tu8VaMZBj5AXgPEV3gmdFmvJuuQSGRBI2ozyT2kCFrVD4miF1hKztlbOoy68l5MsGLZCBdhlBBNdSJOFTCuEIZAwZDZD"
)
url <- sprintf("https://graph.facebook.com/v2.10/136845026417486_1156494751119170?fields=comments&access_token=%s", token)

# Download one Graph API page and return its parsed body.
# `httr::content` is namespaced on purpose: NLP, attached further down in this
# script, masks `content()`, which breaks a re-run in the same session.
fetch_page <- function(page_url) {
  res <- httr::GET(page_url)
  # Fail loudly on HTTP errors (e.g. an expired token) instead of trying to
  # parse an error payload as a comment list.
  httr::stop_for_status(res)
  httr::content(res)
}

# Keep only the comment text, not the commenter or the timestamp.
# `message` is read by name so the result does not depend on every comment
# flattening to exactly five fields; entries without a message are skipped.
extract_messages <- function(items) {
  msgs <- lapply(items, function(item) item[["message"]])
  as.character(unlist(Filter(Negate(is.null), msgs)))
}

# The first response nests the comment list under `comments`; the pages reached
# through the paging links carry `data` and `paging` at the top level.
page <- fetch_page(url)$comments
pages <- list()
repeat {
  pages[[length(pages) + 1L]] <- extract_messages(page$data)
  # Follow the `next` link by name; its absence marks the last page. This also
  # covers a post whose comments fit on a single page.
  next_url <- page$paging[["next"]]
  if (is.null(next_url)) break
  page <- fetch_page(next_url)
}
comments <- data.frame(
  comments = as.character(unlist(pages)),
  stringsAsFactors = FALSE
)
# Save the collected comments to a text file
write.table(comments, file = "comments.txt")
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
library(tm)
library(tmcn)
## # tmcn Version: 0.2-8
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
# Reading a single file: load the saved comments and wrap them in a corpus
file <- readLines("comments.txt")
docs <- Corpus(VectorSource(file))
# Reading several files instead:
#filenames <- list.files(getwd(), pattern="*.txt")
#files <- lapply(filenames, readLines)
#docs <- Corpus(VectorSource(files))
# Clean the text
# toSpace replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Newlines and common single-character function words, blanked out one after
# another in this order.
noise_patterns <- c("\n", "的", "了", "是", "都", "你", "我", "很", "也", "嗎", "讓", "和")
for (noise in noise_patterns) {
  docs <- tm_map(docs, toSpace, noise)
}
# Standard tm clean-up passes: punctuation, then digits, then extra whitespace.
for (cleaner in list(removePunctuation, removeNumbers, stripWhitespace)) {
  docs <- tm_map(docs, cleaner)
}
# Add custom words to improve segmentation accuracy
mixseg <- worker()
# Named `custom_words` rather than `segment`, so the vector no longer shadows
# the jiebaR::segment function called by the tokenizer below.
custom_words <- c("世大運", "柯p", "柯P", "柯市長", "柯文哲", "台灣人")
new_user_word(mixseg, custom_words)
## [1] TRUE
# Build the term-frequency table

# Segment one corpus element into a character vector of tokens.
# d: one element of `docs`; its first component is handed to the segmenter
#    (presumably the document text -- confirm against the tm version in use).
# Returns the tokens produced by the `mixseg` worker.
jieba_tokenizer <- function(d) {
  unlist(jiebaR::segment(d[[1]], mixseg))
}
seg <- lapply(docs, jieba_tokenizer)
# Count every token across all documents and sort by descending frequency.
freqFrame <- as.data.frame(table(unlist(seg)))
freqFrame <- freqFrame[order(freqFrame$Freq, decreasing = TRUE), ]
#library(knitr)
#kable(head(freqFrame), format = "markdown")
從文字雲的結果可看出,針對世大運,網友給予柯文哲市長相當正面的回應。此外,由「台灣」、「台灣人」等詞可看出,世大運身為國際賽事,也激起了國人團結愛國的情操。
# Chinese font workaround for macOS: without this setting the glyphs are
# rendered as empty boxes.
par(family = "Heiti TC Light")
# Draw the cloud from the frequency table: words seen at least 20 times,
# capped at 150 words, most frequent placed first.
cloud_colors <- brewer.pal(8, "Dark2")
wordcloud(
  words = freqFrame$Var1,
  freq = freqFrame$Freq,
  scale = c(4, 1),
  min.freq = 20,
  max.words = 150,
  random.order = FALSE,
  random.color = FALSE,
  rot.per = .1,
  colors = cloud_colors,
  ordered.colors = FALSE,
  use.r.layout = FALSE,
  fixed.asp = TRUE
)