library(httr)
library(rjson)
library(httpuv)
library(Rfacebook)
##
## Attaching package: 'Rfacebook'
## The following object is masked from 'package:methods':
##
## getGroup
library(plyr)
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
library(tm)
library(rvest)
## Loading required package: xml2
library(xml2)
library(SnowballC)
library(slam)
library(Matrix)
prefex = "https://graph.facebook.com/v2.10/"
token = "EAACEdEose0cBAKPzxtoU8USbCMGNbPFpd1aLZB2bCWquhAF6YvpCqY2S4MwIgoqLG3eVyA2CZCoQk19U35KchZBJwh4ljZBuQL3WNg5vE8b1SEMAuMPL5eTqpanuhZBKPs4ZB4ZCrOpKhvNVzsOo0MhETGbr9rw0mGmcgJxZC2S6fa7uF1P8ZBnnZCL8oz2IozjlNqTWGPmMeo2wZDZD"
number = 1
attrs = paste0("152472278103133/posts?limit=",number,"&until=2017-11-30&since=2017-11-1&access_token=")
url = paste0(prefex, attrs, token)
res = httr::GET(url)
data = httr::content(res)
groups= matrix(unlist(data$data))
filename = paste0(1, ".txt")
write.table(groups,filename)
after = data$paging$cursors$after
nextflg= data$paging[2]
count=1
while(nextflg!= "NULL"){
count=count+1
attrs = paste0("152472278103133/posts?limit=1&until=2017-11-30&since=2017-11-1&after=",after,"&access_token=")
url = paste0(prefex,attrs,token)
nextres= httr::GET(url)
ndata = httr::content(nextres)
ngroups= matrix(unlist(ndata$data))
after = ndata$paging$cursors$after
nextflg = ndata$paging[3]
filename = paste0(count, ".txt")
write.table(ngroups,filename)
}
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
par(family='STKaiti')
filenames <- list.files(getwd(), pattern="*.txt")
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files))
toSpace <- content_transformer(function(x, pattern) {
return (gsub(pattern, " ", x))
}
)
docs <- tm_map(docs, toSpace, "※")
docs <- tm_map(docs, toSpace, "◆")
docs <- tm_map(docs, toSpace, "‧")
docs <- tm_map(docs, toSpace, "的")
docs <- tm_map(docs, toSpace, "我")
docs <- tm_map(docs, toSpace, "也")
docs <- tm_map(docs, toSpace, "他")
docs <- tm_map(docs, toSpace, "是")
docs <- tm_map(docs, toSpace, "就")
docs <- tm_map(docs, toSpace, "你")
docs <- tm_map(docs, toSpace, "啊")
docs <- tm_map(docs, toSpace, "嗎")
docs <- tm_map(docs, toSpace, "啦")
docs <- tm_map(docs, toSpace, "要")
docs <- tm_map(docs, toSpace, "有")
docs <- tm_map(docs, toSpace, "及")
docs <- tm_map(docs, toSpace, "了")
docs <- tm_map(docs, toSpace, "在")
docs <- tm_map(docs, toSpace, "但")
docs <- tm_map(docs, toSpace, "都")
docs <- tm_map(docs, toSpace, "哈")
docs <- tm_map(docs, toSpace, "不")
docs <- tm_map(docs, toSpace, "與")
docs <- tm_map(docs, toSpace, "什麼")
docs <- tm_map(docs, toSpace, "一個")
docs <- tm_map(docs, toSpace, "們")
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
mixseg = worker()
segment = c("賴清德","勞動基準法","台南","行政院","一例一休","勞基法")
new_user_word(mixseg,segment)
## [1] TRUE
jieba_tokenizer=function(d){
unlist(segment(d[[1]],mixseg))
}
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))
d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus,
control = list(wordLengths = c(1, Inf)))
labor = findAssocs(tdm, "勞工", 0.8)
labor
## $勞工
## 休 例\xa4 修法 彈性 權益 二日
## 0.97 0.97 0.97 0.97 0.97 0.95
## 週休 輪班 機\xb7 勞動 小時 保障
## 0.95 0.95 0.95 0.94 0.92 0.92
## 文 勞動部 集 又 分 全體
## 0.90 0.90 0.90 0.89 0.89 0.89
## 改變 間隔 運\xa7 調整 賦予 變
## 0.89 0.89 0.89 0.89 0.89 0.89
## 勞資 原則 經濟 七休 三班制 工
## 0.89 0.85 0.85 0.85 0.85 0.85
## 工時 之所以 以利 充耳 出於 打工
## 0.85 0.85 0.85 0.85 0.85 0.85
## 未休 正\xb1 休如 休假 全文如下 各方
## 0.85 0.85 0.85 0.85 0.85 0.85
## 各站 各國 吃 多萬個 收入 而言之
## 0.85 0.85 0.85 0.85 0.85 0.85
## 完可經 完則 形式 把關 更忙 更累
## 0.85 0.85 0.85 0.85 0.85 0.85
## 每日 每月 每週 見 依靠 刻
## 0.85 0.85 0.85 0.85 0.85 0.85
## 協議 或者 承擔 拚 放下 金
## 0.85 0.85 0.85 0.85 0.85 0.85
## 冠軍 卻 客\xc6 建言 思維 施行
## 0.85 0.85 0.85 0.85 0.85 0.85
## 既 為數 看 看到 看待 衍生
## 0.85 0.85 0.85 0.85 0.85 0.85
## 計算 限制 修正案 修改 兼 家計
## 0.85 0.85 0.85 0.85 0.85 0.85
## 捍衛 時工 特 班費 真 衷心希望
## 0.85 0.85 0.85 0.85 0.85 0.85
## 除 商\xb7 基層 接受 推 排班
## 0.85 0.85 0.85 0.85 0.85 0.85
## 第二份 第二個 責任 備詢 勞動基準法 勞資雙方
## 0.85 0.85 0.85 0.85 0.85 0.85
## 喪失 幾萬家 無非 絕非 虛擬 視而
## 0.85 0.85 0.85 0.85 0.85 0.85
## 須經 僅只 圓滿成\xa5 彙 意\xa5 極端
## 0.85 0.85 0.85 0.85 0.85 0.85
## 經 經部門 綁住 資方 運用 對立
## 0.85 0.85 0.85 0.85 0.85 0.85
## 管道 聞 整 應 還休 隱形
## 0.85 0.85 0.85 0.85 0.85 0.85
## 歸納 歸納\xb0 難道 嚴重 議\xb7 黨派
## 0.85 0.85 0.85 0.85 0.85 0.85
## 變以 體制 個 經營 蔡 指示
## 0.85 0.85 0.84 0.84 0.81 0.81
## 給予 預告 雖
## 0.81 0.81 0.81
N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{
log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
for(y in 1:ncol(tdm))
{
doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
}
}
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:httr':
##
## config
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
topID = lapply(rownames(as.data.frame(labor)), function(x)
which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
x = as.numeric(colnames(doc.tfidf)),
y = doc.tfidf[topID[5],],
name = rownames(doc.tfidf)[topID[5]],
type = "scatter", mode= "box") %>%
add_trace(y = doc.tfidf[topID[12],],
name = rownames(doc.tfidf)[topID[12]])
#第1,19,29篇是權益與保障比較有相關的篇章
pp = (doc.tfidf != rep(0,35))
ppid = which(row_sums(pp) != 0)
q <- rownames(doc.tfidf[ppid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]
cos.sim <- function(x, y)
{
(as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y))
}
doc.cos <- apply(s.tdm[,1:35], 2, cos.sim,
y=as.matrix(s.tdm[,35]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
x = rownames(as.data.frame(orderDoc)),
y = orderDoc,
name = rownames(doc.tfidf)[topID[3]],
type = "bar", mode= "box")
## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'
找尋分群現象
str(doc.tfidf)
## num [1:1838, 1:35] 0 0.0114 0.0216 0.0216 0.0216 ...
## - attr(*, "dimnames")=List of 2
## ..$ Terms: chr [1:1838] "c" "\xa4" "二日" "入" ...
## ..$ Docs : chr [1:35] "1" "2" "3" "4" ...
summary(doc.tfidf)
## 1 2 3
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001715 Mean :0.001884 Mean :0.001822
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.081918 Max. :0.219826 Max. :0.160290
## 4 5 6
## Min. :0.000000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.000000 Median :0.00000 Median :0.000000
## Mean :0.001804 Mean :0.00194 Mean :0.001969
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :0.188921 Max. :0.10259 Max. :0.220614
## 7 8 9
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001955 Mean :0.001512 Mean :0.001619
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.178410 Max. :0.127813 Max. :0.148603
## 10 11 12
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001898 Mean :0.001607 Mean :0.001954
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.157473 Max. :0.317637 Max. :0.124376
## 13 14 15
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001615 Mean :0.001658 Mean :0.002108
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.184286 Max. :0.267150 Max. :0.169097
## 16 17 18
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001752 Mean :0.001583 Mean :0.001581
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.107607 Max. :0.141773 Max. :0.165461
## 19 20 21
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001877 Mean :0.002102 Mean :0.001714
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.100512 Max. :0.179975 Max. :0.149235
## 22 23 24
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001994 Mean :0.001533 Mean :0.001999
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.347094 Max. :0.180129 Max. :0.270773
## 25 26 27
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001662 Mean :0.002041 Mean :0.002036
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.160290 Max. :0.172413 Max. :0.306880
## 28 29 30
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001722 Mean :0.001731 Mean :0.001911
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.088436 Max. :0.072584 Max. :0.284960
## 31 32 33
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.002033 Mean :0.001539 Mean :0.001716
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.266456 Max. :0.120689 Max. :0.269962
## 34 35
## Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000
## Mean :0.001827 Mean :0.001845
## 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.155964 Max. :0.176872
set.seed(35)
result = kmeans(doc.tfidf, 2, nstart = 50)
plot(doc.tfidf, col =(result$cluster +1) , main="K-MEAN", pch=15, cex=2)
testtfidf = doc.tfidf
tfidf.pca = prcomp(testtfidf)
biplot(tfidf.pca,color=c(1,35))
## Warning in plot.window(...): "color" 不是一個繪圖參數
## Warning in plot.xy(xy, type, ...): "color" 不是一個繪圖參數
## Warning in axis(side = side, at = at, labels = labels, ...): "color" 不是一
## 個繪圖參數
## Warning in axis(side = side, at = at, labels = labels, ...): "color" 不是一
## 個繪圖參數
## Warning in box(...): "color" 不是一個繪圖參數
## Warning in title(...): "color" 不是一個繪圖參數
## Warning in text.default(x, xlabs, cex = cex[1L], col = col[1L], ...):
## "color" 不是一個繪圖參數
## Warning in plot.window(...): "color" 不是一個繪圖參數
## Warning in plot.xy(xy, type, ...): "color" 不是一個繪圖參數
## Warning in title(...): "color" 不是一個繪圖參數
## Warning in axis(3, col = col[2L], ...): "color" 不是一個繪圖參數
## Warning in axis(4, col = col[2L], ...): "color" 不是一個繪圖參數
## Warning in text.default(y, labels = ylabs, cex = cex[2L], col = col[2L], :
## "color" 不是一個繪圖參數
第18,22,30,分群現象明顯