匯入library

library(httr)
library(rjson)
library(httpuv)
library(Rfacebook, warn.conflicts = FALSE)
library(plyr)
library(NLP, warn.conflicts = FALSE)
library(tm)
library(xml2)
library(rvest, warn.conflicts = FALSE)
library(SnowballC)
library(slam)
library(Matrix)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

在facebook做文本的抓取與字詞清理 找到一些特別的用語並將其加入

token <- "EAACEdEose0cBAFOQiLAJBtv5cPEPH8q04LEXSai354iYbY8YZCNZBryxXZB38JLrmZChtxMRhjIcgnKr9YaSuTKjW9xBnuRtVnnseBjJukgToLHXzI538b7NPWihF85pEPck6OJjZBYCOuIKSc1FfWf480y3hw5SWkQGmoh5tEurjstLxFmlWkjd34lfFDR9PPS19RBE4ri9I7Jvx7FNR" 
page.id <- "1431308506880067" 
page <- getPage(page.id, token, n = 150)
## 25 posts 50 posts 75 posts 100 posts 108 posts
docs = Corpus(VectorSource(as.character(page[,3])))
toSpace = content_transformer(function(x,pattern){
  return (gsub(pattern," ",x))
})
docs <- tm_map(docs, toSpace, "※")
docs <- tm_map(docs, toSpace, "◆")
docs <- tm_map(docs, toSpace, "‧")
docs <- tm_map(docs, toSpace, "的")
docs <- tm_map(docs, toSpace, "我")
docs <- tm_map(docs, toSpace, "也")
docs <- tm_map(docs, toSpace, "他")
docs <- tm_map(docs, toSpace, "是")
docs <- tm_map(docs, toSpace, "就")
docs <- tm_map(docs, toSpace, "你")
docs <- tm_map(docs, toSpace, "啊")
docs <- tm_map(docs, toSpace, "嗎")
docs <- tm_map(docs, toSpace, "啦")
docs <- tm_map(docs, toSpace, "要")
docs <- tm_map(docs, toSpace, "有")
docs <- tm_map(docs, toSpace, "及")
docs <- tm_map(docs, toSpace, "了")
docs <- tm_map(docs, toSpace, "在")
docs <- tm_map(docs, toSpace, "但")
docs <- tm_map(docs, toSpace, "都")
docs <- tm_map(docs, toSpace, "哈")
docs <- tm_map(docs, toSpace, "不")
docs <- tm_map(docs, toSpace, "與")
docs <- tm_map(docs, toSpace, "什麼")
docs <- tm_map(docs, toSpace, "一個")
docs <- tm_map(docs, toSpace, "們")
docs <- tm_map(docs, toSpace, "這")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
mixseg = worker()
segment = c("米絲肉雞")
new_user_word(mixseg,segment)
## [1] TRUE

做出相關性的指標 因為是一個講述歷史的粉絲專業所以以歷史為關鍵詞

jieba_tokenizer=function(d){
  unlist(segment(d[[1]],mixseg))
}
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))

d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus, 
                          control = list(wordLengths = c(1, Inf)))

labor = findAssocs(tdm, "歷史", 0.6)
labor
## $歷史
##     之盟     平生       扔     先覺     自知     求和     後方     個國 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.80 
##   高中生     高歌       問       接     教室 喪權辱國     朝代     歲幣 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.80 
##     腦勺       嘛       夢       熬     戰勝     澶淵     雙手     老師 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.65 
##       談 
##     0.63

作tf-id 統計圖表

N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{ 
  log2( N / nnzero(word_doc) ) 
}
idf <- apply(tdm, 1, idfCal)


doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}
topID = lapply(rownames(as.data.frame(labor)), function(x) 
  which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[24],], 
        name = rownames(doc.tfidf)[topID[24]],
        type = "scatter", mode= "box") %>%
  add_trace(y = doc.tfidf[topID[25],],
            name = rownames(doc.tfidf)[topID[25]])

# 尋找文章之間的相關性 用COS.sim

nonzero = (doc.tfidf != rep(0,10))
nonzeroid = which(row_sums(nonzero) != 0)
q <- rownames(doc.tfidf[nonzeroid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]

cos.sim <- function(x, y)
{ 
  (as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y)) 
}

doc.cos <- apply(s.tdm[,1:100], 2, cos.sim,
                 y=as.matrix(s.tdm[,100]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
        x = rownames(as.data.frame(orderDoc)),
        y = orderDoc, 
        name = rownames(doc.tfidf)[topID[1]],
        type = "bar", mode= "box")

最後做K-means的圖表

set.seed(150)
kmeansOut <- kmeans(doc.tfidf, 2, nstart = 50)
plot(doc.tfidf, col =(kmeansOut$cluster +1) , main = "k-means result", pch=18, cex=2)

testtfidf <- doc.tfidf
tfidf.pca = prcomp(testtfidf)
biplot(tfidf.pca,color=c(1,11))