---
title: "HW4"
output: html_document
---

Data collection

Collect the 320 most recent posts, along with their comments, from the PTT NBA board.
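The crawl itself is not shown in this report. Below is a minimal sketch of how one index page of posts could be fetched with rvest; the CSS selectors and file-naming scheme are assumptions, not part of the original script, and paging back through older index pages would be needed to reach 320 posts.

library(rvest)

# Fetch the NBA board index and collect links to individual posts
# (selectors are illustrative, based on PTT's usual page structure)
base <- "https://www.ptt.cc"
page <- read_html(paste0(base, "/bbs/NBA/index.html"))
links <- html_attr(html_nodes(page, ".r-ent .title a"), "href")

# Save each post (body plus comments) as a .txt file for the pipeline below
for (i in seq_along(links)) {
  post <- read_html(paste0(base, links[i]))
  txt  <- html_text(html_node(post, "#main-content"))
  writeLines(txt, sprintf("post_%03d.txt", i))
}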

Clean the text and tokenize it with the jieba package.

# Start from a clean workspace
rm(list = ls(all.names=TRUE))

library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)

# Read every .txt file (one PTT post per file) into a tm corpus
filenames <- list.files(getwd(), pattern="\\.txt$")
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files))

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Replace PTT markup, newlines, common Chinese stopwords, and date tokens with spaces
patterns <- c("※", "→", "\n", "的", "就", "吧", "被", "啊", "但", "又",
              "了", "是", "啦", "都", "你", "我", "很", "也", "嗎", "讓",
              "和", "推", "噓", "看板", "Oct")
for (p in patterns) docs <- tm_map(docs, toSpace, p)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

# jieba segmenter with the default mixed model
mixseg = worker()
#mixseg[unlist(docs)]
# Quick look at the segmentation result
str(mixseg[unlist(docs)])
##  chr [1:324907] "cx" "作者" "seabox" "歐陽" "盒盒" "NBA" ...
# Tokenizer: segment a single document with jieba
jieba_tokenizer = function(d){
  unlist(segment(d[[1]],mixseg))
}
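The default dictionary tends to split domain-specific names into single characters. If needed, custom terms can be registered on the worker before segmenting; this is a minimal sketch, and the word list is illustrative rather than taken from the original script.

# Register domain terms so they survive segmentation as single tokens
new_user_word(mixseg, c("勇士", "咖哩", "全票", "總冠軍"))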

Build the term-frequency table

# Tokenize every document, then count token frequencies over the whole corpus
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))
freqFrame = freqFrame[order(freqFrame$Freq,decreasing=TRUE), ]
head(freqFrame)
##       Var1 Freq
## 1509    不 3163
## 18605   他 2631
## 23240   有 2558
## 23850   在 2472
## 24657   真 1803
## 7674    好 1721

Convert the articles into a term-document matrix with the tm package's Corpus.

# Rebuild a corpus from the segmented text and form the term-document matrix
d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
inspect(tdm)
## <<TermDocumentMatrix (terms: 36055, documents: 320)>>
## Non-/sparse entries: 176889/11360711
## Sparsity           : 98%
## Maximal term length: 51
## Weighting          : term frequency (tf)
## Sample             :
##      Docs
## Terms 108 175 190 220 236 263 299 301 89 94
##    不  82  26  30  31  16  77  56  45 28 64
##    打   3   0  28  39  18  22  56  37 30 50
##    跟  22   5  13  20  22  47  22  16  9 15
##    好  37  23  31  45  20  21  75  27 23 51
##    人  26  16  15  55  47  19  18   9 10 16
##    他  24   6  10  19  19  49  15   5 12 21
##    要  48  14  20  27  19  30  52  34 17 50
##    有  56  12  30  33  41  56  50  27 21 47
##    在  33  14  27  42  20  47  33  30 28 53
##    真  30  21  42  60  49  21  49  46 15 58

Find the terms most highly associated with 「冠軍」 (championship), along with their association scores, and plot them.

The bars for 「全票」 (unanimous), 「mvp」, 「勇士」 (Warriors), 「咖哩」 (Curry), 「輸」 (lose), and 「丟人」 (embarrassing) show that discussion of the championship still revolves around last year's Finals: the Warriors posted a record-setting 73–9 regular season and Curry won the first unanimous MVP in history, yet they lost the Finals.

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# Terms correlated with 「冠軍」 at 0.45 or above
ass = findAssocs(tdm, "冠軍", 0.45)
g <- unlist(ass)
# Keep the scores numeric so geom_bar gets a continuous y axis
g <- data.frame(關聯詞 = names(g), 關聯度 = as.numeric(g))
#par(family=("Heiti TC Light"))
G <- ggplot(g, aes(x = 關聯詞, y = 關聯度)) + geom_bar(stat = 'identity') + labs(x = '關聯詞', y = '關聯度')
G <- G + theme(axis.title = element_text(family = "Heiti TC Light"),
          axis.text.x = element_text(angle = 60, family = "Heiti TC Light", hjust = 1))
print(G)

Build the TF-IDF matrix

library(rJava)
library(SnowballC)
library(slam)
library(Matrix)

# N: number of documents; tf: total token count of each document
N = tdm$ncol
tf <- apply(tdm, 2, sum)
# idf: log2(N / number of documents containing the term)
idfCal <- function(word_doc)
{
  log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)

# Element-wise: tf-idf = (term count / document token total) * idf
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}
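The double loop gives the right numbers but is slow on a 36055 × 320 matrix. As a sketch of an equivalent, faster formulation, the same matrix can be produced with two sweep() calls:

# Scale each column by its document's token total, then each row by its idf
doc.tfidf <- as.matrix(tdm)
doc.tfidf <- sweep(doc.tfidf, 2, tf, "/")
doc.tfidf <- sweep(doc.tfidf, 1, idf, "*")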

Plot the TF-IDF statistics of terms highly associated with 「冠軍」

The figure shows that Curry and his team, the Warriors, tend to appear in articles that discuss the championship.

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Row indices in the tdm of the terms most associated with 「冠軍」
topID = lapply(rownames(as.data.frame(ass)), function(x)
  which(rownames(tdm) == x))
topID = unlist(topID)

# Plot tf-idf across the 320 documents for three of the associated terms
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[1],],
        name = rownames(doc.tfidf)[topID[1]],
        type = "scatter", mode = "lines") %>%
  add_trace(y = doc.tfidf[topID[3],],
            name = rownames(doc.tfidf)[topID[3]]) %>%
  add_trace(y = doc.tfidf[topID[6],],
            name = rownames(doc.tfidf)[topID[6]])

Find the optimal number of clusters with the Elbow Method

# Documents as rows; drop zero-variance terms, which k-means cannot use
mydata <- t(doc.tfidf)
mydata <- mydata[,apply(mydata, 2, var, na.rm=TRUE) != 0]
# Within-group sum of squares for k = 1 (total variance) through 15
wss <- (nrow(mydata)-1) * sum(apply(mydata, 2 ,var))
for (i in 2:15)
  wss[i] <- sum(kmeans(mydata,centers=i)$withinss)

plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares",
     main = "Assessing the Optimal Number of Clusters with the Elbow Method",
     pch = 20, cex = 2)

Using the number of clusters suggested above (set to 14 for now), run K-Means clustering on the articles.

num_clusters = 14
kmeansOut = kmeans(mydata, num_clusters, nstart = 20, algorithm = c("Lloyd"), iter.max = 30)
# Project the documents onto principal components for plotting later
mydata.pca = prcomp(mydata)
mydata.kmeans = as.factor(kmeansOut$cluster)

kmeans_clustering <- as.data.frame(mydata.kmeans)
head(kmeans_clustering, 20)
##    mydata.kmeans
## 1              3
## 2             13
## 3              3
## 4              3
## 5              3
## 6              3
## 7              3
## 8              3
## 9              3
## 10             3
## 11             3
## 12             3
## 13             3
## 14             3
## 15             3
## 16             3
## 17             3
## 18             3
## 19             3
## 20             3

Plot with PCA dimensionality reduction

The figure shows no clear-cut clustering of the articles. A likely reason is that the NBA board is already a topic-specific board on PTT, so the posts are hard to separate further. The figure also exposes a problem: the text cleaning could not effectively remove irrelevant tokens (URLs, poster and commenter IDs, and so on were all treated as text), so the accuracy of the result is still somewhat limited.
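One way to reduce the noise described above would be to strip URLs and commenter-ID prefixes before tokenizing. This is a minimal sketch; the regular expressions are illustrative and would need tuning against real PTT posts.

# Remove URLs, then "userid:" prefixes on comment lines (patterns are illustrative)
docs <- tm_map(docs, toSpace, "https?://[^ ]+")
docs <- tm_map(docs, toSpace, "[A-Za-z0-9]+:")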

library(ggbiplot)
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## Loading required package: scales
## Loading required package: grid
library(scales)
library(grid)

g1 <- ggbiplot(mydata.pca, obs.scale = 1, var.scale = 1, 
              groups = mydata.kmeans, ellipse = TRUE, 
              circle = TRUE, labels = rownames(mydata))
g1 <- g1 + scale_color_discrete(name = '')
g1 <- g1 + theme(legend.direction = 'horizontal', 
               legend.position = 'top')
print(g1)