Text scraping and cleaning (Premier Lai's Facebook posts from November)

library(httr)
library(rjson)
library(httpuv)
library(Rfacebook)
## 
## Attaching package: 'Rfacebook'
## The following object is masked from 'package:methods':
## 
##     getGroup
library(plyr)
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
library(tm)
library(rvest)
## Loading required package: xml2
library(xml2)
library(SnowballC)
library(slam)
library(Matrix)
prefix = "https://graph.facebook.com/v2.10/"   # Graph API base URL
token  = "EAACEdEose0cBAKPzxtoU8USbCMGNbPFpd1aLZB2bCWquhAF6YvpCqY2S4MwIgoqLG3eVyA2CZCoQk19U35KchZBJwh4ljZBuQL3WNg5vE8b1SEMAuMPL5eTqpanuhZBKPs4ZB4ZCrOpKhvNVzsOo0MhETGbr9rw0mGmcgJxZC2S6fa7uF1P8ZBnnZCL8oz2IozjlNqTWGPmMeo2wZDZD"
number = 1
attrs  = paste0("152472278103133/posts?limit=", number, "&until=2017-11-30&since=2017-11-1&access_token=")
url    = paste0(prefix, attrs, token)
res    = httr::GET(url)
data   = httr::content(res)
groups = matrix(unlist(data$data))
filename = paste0(1, ".txt")
write.table(groups, filename)
after   = data$paging$cursors$after   # cursor pointing at the next page
nextflg = data$paging$`next`          # NULL once there are no more pages
count   = 1
while(!is.null(nextflg)){
  count  = count + 1
  attrs  = paste0("152472278103133/posts?limit=1&until=2017-11-30&since=2017-11-1&after=", after, "&access_token=")
  url    = paste0(prefix, attrs, token)
  nextres = httr::GET(url)
  ndata   = httr::content(nextres)
  ngroups = matrix(unlist(ndata$data))
  after   = ndata$paging$cursors$after
  nextflg = ndata$paging$`next`
  filename = paste0(count, ".txt")
  write.table(ngroups, filename)
}
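As a quick sanity check (not part of the original script), the number of saved files should match the number of posts returned for the November window; a minimal sketch:

# Hypothetical check: count the per-post .txt files written by the loop above
length(list.files(getwd(), pattern = "\\.txt$"))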
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
par(family='STKaiti')
filenames <- list.files(getwd(), pattern = "\\.txt$")
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files))
# Replace every match of a pattern with a single space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
docs <- tm_map(docs, toSpace, "※")
docs <- tm_map(docs, toSpace, "◆")
docs <- tm_map(docs, toSpace, "‧")
docs <- tm_map(docs, toSpace, "的")
docs <- tm_map(docs, toSpace, "我")
docs <- tm_map(docs, toSpace, "也")
docs <- tm_map(docs, toSpace, "他")
docs <- tm_map(docs, toSpace, "是")
docs <- tm_map(docs, toSpace, "就")
docs <- tm_map(docs, toSpace, "你")
docs <- tm_map(docs, toSpace, "啊")
docs <- tm_map(docs, toSpace, "嗎")
docs <- tm_map(docs, toSpace, "啦")
docs <- tm_map(docs, toSpace, "要")
docs <- tm_map(docs, toSpace, "有")
docs <- tm_map(docs, toSpace, "及")
docs <- tm_map(docs, toSpace, "了")
docs <- tm_map(docs, toSpace, "在")
docs <- tm_map(docs, toSpace, "但")
docs <- tm_map(docs, toSpace, "都")
docs <- tm_map(docs, toSpace, "哈")
docs <- tm_map(docs, toSpace, "不")
docs <- tm_map(docs, toSpace, "與")
docs <- tm_map(docs, toSpace, "什麼")
docs <- tm_map(docs, toSpace, "一個")
docs <- tm_map(docs, toSpace, "們")
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
mixseg = worker()   # default jiebaR segmenter
# Add domain-specific terms so jieba keeps them as single tokens
new_words = c("賴清德","勞動基準法","台南","行政院","一例一休","勞基法")
new_user_word(mixseg, new_words)
## [1] TRUE
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))
}
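Before tokenizing the whole corpus, it helps to try the segmenter on one sentence; a minimal sketch with a made-up test sentence, assuming the custom dictionary above has been loaded:

# Hypothetical test sentence; custom terms such as 一例一休 should stay intact
segment("行政院長賴清德說明一例一休與勞動基準法修法", mixseg)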
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))
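A quick look at the most frequent tokens (a sketch, not in the original analysis) confirms that the stop-word cleaning above worked:

# Top 10 tokens by raw frequency across all posts
head(freqFrame[order(freqFrame$Freq, decreasing = TRUE), ], 10)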

d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus, 
                          control = list(wordLengths = c(1, Inf)))  # keep single-character terms
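To sanity-check the term-document matrix, one can look at its dimensions and at the terms occurring at least ten times (a sketch; the threshold of 10 is arbitrary):

dim(tdm)                         # terms x documents
findFreqTerms(tdm, lowfreq = 10)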

Because I am particularly interested in labor issues, I look for words highly associated with 勞工 (labor) and then analyze the two related keywords 權益 (rights) and 保障 (protection).

labor = findAssocs(tdm, "勞工", 0.8)
labor
## $勞工
##         休     例\xa4       修法       彈性       權益       二日 
##       0.97       0.97       0.97       0.97       0.97       0.95 
##       週休       輪班     機\xb7       勞動       小時       保障 
##       0.95       0.95       0.95       0.94       0.92       0.92 
##         文     勞動部         集         又         分       全體 
##       0.90       0.90       0.90       0.89       0.89       0.89 
##       改變       間隔     運\xa7       調整       賦予         變 
##       0.89       0.89       0.89       0.89       0.89       0.89 
##       勞資       原則       經濟       七休     三班制         工 
##       0.89       0.85       0.85       0.85       0.85       0.85 
##       工時     之所以       以利       充耳       出於       打工 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       未休     正\xb1       休如       休假   全文如下       各方 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       各站       各國         吃     多萬個       收入     而言之 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##     完可經       完則       形式       把關       更忙       更累 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       每日       每月       每週         見       依靠         刻 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       協議       或者       承擔         拚       放下         金 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       冠軍         卻     客\xc6       建言       思維       施行 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##         既       為數         看       看到       看待       衍生 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       計算       限制     修正案       修改         兼       家計 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       捍衛       時工         特       班費         真   衷心希望 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##         除     商\xb7       基層       接受         推       排班 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##     第二份     第二個       責任       備詢 勞動基準法   勞資雙方 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       喪失     幾萬家       無非       絕非       虛擬       視而 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       須經       僅只 圓滿成\xa5         彙     意\xa5       極端 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##         經     經部門       綁住       資方       運用       對立 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       管道         聞         整         應       還休       隱形 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       歸納   歸納\xb0       難道       嚴重     議\xb7       黨派 
##       0.85       0.85       0.85       0.85       0.85       0.85 
##       變以       體制         個       經營         蔡       指示 
##       0.85       0.85       0.84       0.84       0.81       0.81 
##       給予       預告         雖 
##       0.81       0.81       0.81
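Since 權益 and 保障 are the two keywords of interest, the same association search can be run on them directly (a sketch; findAssocs accepts a vector of terms, and the 0.8 threshold mirrors the call above):

findAssocs(tdm, c("權益", "保障"), 0.8)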
N = tdm$ncol                 # number of documents
tf <- apply(tdm, 2, sum)     # total term count per document
idfCal <- function(word_doc)
{ 
  log2( N / nnzero(word_doc) )   # inverse document frequency of one term
}
idf <- apply(tdm, 1, idfCal)


doc.tfidf <- as.matrix(tdm)
# Weight every cell: (term count / document length) * idf of the term
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}
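The nested loop above is easy to read but slow for a 1,838 x 35 matrix; an equivalent vectorized computation (a sketch that should reproduce the same doc.tfidf values):

# Divide each column by its document length, then scale each row by its idf
doc.tfidf.vec <- sweep(as.matrix(tdm), 2, tf, "/") * idf
# all.equal(doc.tfidf, doc.tfidf.vec)   # should be TRUE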
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Row indices in the tdm of the terms associated with 勞工
topID = lapply(rownames(as.data.frame(labor)), function(x) 
  which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[5],], 
        name = rownames(doc.tfidf)[topID[5]],
        type = "scatter", mode = "lines+markers") %>%
  add_trace(y = doc.tfidf[topID[12],],
            name = rownames(doc.tfidf)[topID[12]])

# Posts 1, 19, and 29 are the ones most related to 權益 and 保障
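One way to check which posts score highest on the two keywords (a sketch, assuming 權益 and 保障 both appear as row names of doc.tfidf):

round(sort(doc.tfidf["權益", ], decreasing = TRUE), 3)[1:5]
round(sort(doc.tfidf["保障", ], decreasing = TRUE), 3)[1:5]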

Use a bar chart to examine how similar the 35 posts are to each other.

pp = (doc.tfidf != 0)                # TRUE where a term has non-zero weight in a document
ppid = which(row_sums(pp) != 0)      # keep terms that appear in at least one document
q <- rownames(doc.tfidf[ppid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]
# Cosine similarity between two tf-idf vectors (Euclidean norms)
cos.sim <- function(x, y)
{ 
  (as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x), "2") * norm(as.matrix(y), "2")) 
}
doc.cos <- apply(s.tdm[,1:35], 2, cos.sim,
                 y = as.matrix(s.tdm[,35]))   # similarity of every post to post 35
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
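The head of the ordered vector (a sketch, not in the original) shows which posts are most similar to post 35:

head(orderDoc, 5)   # document ids with the highest cosine similarity to post 35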
plot_ly(data = as.data.frame(orderDoc),
        x = rownames(as.data.frame(orderDoc)),
        y = orderDoc, 
        type = "bar")

Looking for clustering patterns

str(doc.tfidf)
##  num [1:1838, 1:35] 0 0.0114 0.0216 0.0216 0.0216 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Terms: chr [1:1838] "c" "\xa4" "二日" "入" ...
##   ..$ Docs : chr [1:35] "1" "2" "3" "4" ...
summary(doc.tfidf)
##        1                  2                  3           
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001715   Mean   :0.001884   Mean   :0.001822  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.081918   Max.   :0.219826   Max.   :0.160290  
##        4                  5                 6           
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.00000   Median :0.000000  
##  Mean   :0.001804   Mean   :0.00194   Mean   :0.001969  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :0.188921   Max.   :0.10259   Max.   :0.220614  
##        7                  8                  9           
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001955   Mean   :0.001512   Mean   :0.001619  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.178410   Max.   :0.127813   Max.   :0.148603  
##        10                 11                 12          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001898   Mean   :0.001607   Mean   :0.001954  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.157473   Max.   :0.317637   Max.   :0.124376  
##        13                 14                 15          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001615   Mean   :0.001658   Mean   :0.002108  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.184286   Max.   :0.267150   Max.   :0.169097  
##        16                 17                 18          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001752   Mean   :0.001583   Mean   :0.001581  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.107607   Max.   :0.141773   Max.   :0.165461  
##        19                 20                 21          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001877   Mean   :0.002102   Mean   :0.001714  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.100512   Max.   :0.179975   Max.   :0.149235  
##        22                 23                 24          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001994   Mean   :0.001533   Mean   :0.001999  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.347094   Max.   :0.180129   Max.   :0.270773  
##        25                 26                 27          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001662   Mean   :0.002041   Mean   :0.002036  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.160290   Max.   :0.172413   Max.   :0.306880  
##        28                 29                 30          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001722   Mean   :0.001731   Mean   :0.001911  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.088436   Max.   :0.072584   Max.   :0.284960  
##        31                 32                 33          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.002033   Mean   :0.001539   Mean   :0.001716  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.266456   Max.   :0.120689   Max.   :0.269962  
##        34                 35          
##  Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000  
##  Mean   :0.001827   Mean   :0.001845  
##  3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.155964   Max.   :0.176872
set.seed(35)
# k-means with 2 clusters over the rows (terms) of the tf-idf matrix
result = kmeans(doc.tfidf, 2, nstart = 50)
plot(doc.tfidf, col = (result$cluster + 1), main = "K-MEAN", pch = 15, cex = 2)
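Note that the rows of doc.tfidf are terms, so this k-means groups the 1,838 terms rather than the 35 posts; a quick look at the cluster sizes (a sketch):

table(result$cluster)   # number of terms assigned to each of the two clusters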

testtfidf = doc.tfidf
tfidf.pca = prcomp(testtfidf)        # PCA over the tf-idf matrix
biplot(tfidf.pca, col = c(1, 35))    # terms as points, documents as arrows

Posts 18, 22, and 30 show a clear clustering pattern.