匯入library

library(httr)
library(rjson)
library(httpuv)
library(Rfacebook, warn.conflicts = FALSE)
library(plyr)
library(NLP, warn.conflicts = FALSE)
library(tm)
library(xml2)
library(rvest, warn.conflicts = FALSE)
library(SnowballC)
library(slam)
library(Matrix)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:httr':
## 
##     config

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

在facebook做文本的抓取與字詞清理找到一些特別的用語並將其加入

token <- "EAACEdEose0cBAFOQiLAJBtv5cPEPH8q04LEXSai354iYbY8YZCNZBryxXZB38JLrmZChtxMRhjIcgnKr9YaSuTKjW9xBnuRtVnnseBjJukgToLHXzI538b7NPWihF85pEPck6OJjZBYCOuIKSc1FfWf480y3hw5SWkQGmoh5tEurjstLxFmlWkjd34lfFDR9PPS19RBE4ri9I7Jvx7FNR" 
page.id <- "1431308506880067" 
page <- getPage(page.id, token, n = 150)

## 25 posts 50 posts 75 posts 100 posts 108 posts

docs = Corpus(VectorSource(as.character(page[,3])))
toSpace = content_transformer(function(x,pattern){
  return (gsub(pattern," ",x))
})
docs <- tm_map(docs, toSpace, "※")
docs <- tm_map(docs, toSpace, "◆")
docs <- tm_map(docs, toSpace, "‧")
docs <- tm_map(docs, toSpace, "的")
docs <- tm_map(docs, toSpace, "我")
docs <- tm_map(docs, toSpace, "也")
docs <- tm_map(docs, toSpace, "他")
docs <- tm_map(docs, toSpace, "是")
docs <- tm_map(docs, toSpace, "就")
docs <- tm_map(docs, toSpace, "你")
docs <- tm_map(docs, toSpace, "啊")
docs <- tm_map(docs, toSpace, "嗎")
docs <- tm_map(docs, toSpace, "啦")
docs <- tm_map(docs, toSpace, "要")
docs <- tm_map(docs, toSpace, "有")
docs <- tm_map(docs, toSpace, "及")
docs <- tm_map(docs, toSpace, "了")
docs <- tm_map(docs, toSpace, "在")
docs <- tm_map(docs, toSpace, "但")
docs <- tm_map(docs, toSpace, "都")
docs <- tm_map(docs, toSpace, "哈")
docs <- tm_map(docs, toSpace, "不")
docs <- tm_map(docs, toSpace, "與")
docs <- tm_map(docs, toSpace, "什麼")
docs <- tm_map(docs, toSpace, "一個")
docs <- tm_map(docs, toSpace, "們")
docs <- tm_map(docs, toSpace, "這")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
mixseg = worker()
segment = c("米絲肉雞")
new_user_word(mixseg,segment)

## [1] TRUE

做出相關性的指標因為是一個講述歷史的粉絲專業所以以歷史為關鍵詞

jieba_tokenizer=function(d){
  unlist(segment(d[[1]],mixseg))
}
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))

d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus, 
                          control = list(wordLengths = c(1, Inf)))

labor = findAssocs(tdm, "歷史", 0.6)
labor

## $歷史
##     之盟     平生       扔     先覺     自知     求和     後方     個國 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.80 
##   高中生     高歌       問       接     教室 喪權辱國     朝代     歲幣 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.80 
##     腦勺       嘛       夢       熬     戰勝     澶淵     雙手     老師 
##     0.80     0.80     0.80     0.80     0.80     0.80     0.80     0.65 
##       談 
##     0.63

作tf-id 統計圖表

N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{ 
  log2( N / nnzero(word_doc) ) 
}
idf <- apply(tdm, 1, idfCal)


doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}
topID = lapply(rownames(as.data.frame(labor)), function(x) 
  which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[24],], 
        name = rownames(doc.tfidf)[topID[24]],
        type = "scatter", mode= "box") %>%
  add_trace(y = doc.tfidf[topID[25],],
            name = rownames(doc.tfidf)[topID[25]])

# 尋找文章之間的相關性用COS.sim

nonzero = (doc.tfidf != rep(0,10))
nonzeroid = which(row_sums(nonzero) != 0)
q <- rownames(doc.tfidf[nonzeroid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]

cos.sim <- function(x, y)
{ 
  (as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y)) 
}

doc.cos <- apply(s.tdm[,1:100], 2, cos.sim,
                 y=as.matrix(s.tdm[,100]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
        x = rownames(as.data.frame(orderDoc)),
        y = orderDoc, 
        name = rownames(doc.tfidf)[topID[1]],
        type = "bar", mode= "box")

最後做K-means的圖表

set.seed(150)
kmeansOut <- kmeans(doc.tfidf, 2, nstart = 50)
plot(doc.tfidf, col =(kmeansOut$cluster +1) , main = "k-means result", pch=18, cex=2)

testtfidf <- doc.tfidf
tfidf.pca = prcomp(testtfidf)
biplot(tfidf.pca,color=c(1,11))

hw4

匯入library

在facebook做文本的抓取與字詞清理 找到一些特別的用語並將其加入

做出相關性的指標 因為是一個講述歷史的粉絲專業所以以歷史為關鍵詞

作tf-id 統計圖表

最後做K-means的圖表

在facebook做文本的抓取與字詞清理找到一些特別的用語並將其加入

做出相關性的指標因為是一個講述歷史的粉絲專業所以以歷史為關鍵詞