itle: “HW4” |
utput: html_document |
收集PTT NBA板上最近320篇貼文與其留言
rm(list = ls(all.names=TRUE))
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
filenames <- list.files(getwd(), pattern="*.txt")
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs,toSpace, "※")
docs <- tm_map(docs,toSpace, "→")
docs <- tm_map(docs,toSpace, "\n")
docs <- tm_map(docs,toSpace, "的")
docs <- tm_map(docs,toSpace, "就")
docs <- tm_map(docs,toSpace, "吧")
docs <- tm_map(docs,toSpace, "被")
docs <- tm_map(docs,toSpace, "啊")
docs <- tm_map(docs,toSpace, "但")
docs <- tm_map(docs,toSpace, "又")
docs <- tm_map(docs,toSpace, "了")
docs <- tm_map(docs,toSpace, "是")
docs <- tm_map(docs,toSpace, "啦")
docs <- tm_map(docs,toSpace, "都")
docs <- tm_map(docs,toSpace, "你")
docs <- tm_map(docs,toSpace, "我")
docs <- tm_map(docs,toSpace, "很")
docs <- tm_map(docs,toSpace, "也")
docs <- tm_map(docs,toSpace, "嗎")
docs <- tm_map(docs,toSpace, "讓")
docs <- tm_map(docs,toSpace, "和")
docs <- tm_map(docs,toSpace, "推")
docs <- tm_map(docs,toSpace, "噓")
docs <- tm_map(docs,toSpace, "看板")
docs <- tm_map(docs,toSpace, "Oct")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
mixseg = worker()
#mixseg[unlist(docs)]
str(mixseg[unlist(docs)])
## chr [1:324907] "cx" "作者" "seabox" "歐陽" "盒盒" "NBA" ...
jieba_tokenizer = function(d){
unlist(segment(d[[1]],mixseg))
}
seg = lapply(docs, jieba_tokenizer)
freqFrame = as.data.frame(table(unlist(seg)))
freqFrame = freqFrame[order(freqFrame$Freq,decreasing=TRUE), ]
head(freqFrame)
## Var1 Freq
## 1509 不 3163
## 18605 他 2631
## 23240 有 2558
## 23850 在 2472
## 24657 真 1803
## 7674 好 1721
d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
inspect(tdm)
## <<TermDocumentMatrix (terms: 36055, documents: 320)>>
## Non-/sparse entries: 176889/11360711
## Sparsity : 98%
## Maximal term length: 51
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 108 175 190 220 236 263 299 301 89 94
## 不 82 26 30 31 16 77 56 45 28 64
## 打 3 0 28 39 18 22 56 37 30 50
## 跟 22 5 13 20 22 47 22 16 9 15
## 好 37 23 31 45 20 21 75 27 23 51
## 人 26 16 15 55 47 19 18 9 10 16
## 他 24 6 10 19 19 49 15 5 12 21
## 要 48 14 20 27 19 30 52 34 17 50
## 有 56 12 30 33 41 56 50 27 21 47
## 在 33 14 27 42 20 47 33 30 28 53
## 真 30 21 42 60 49 21 49 46 15 58
查找與「勇士隊」最相關的詞彙和其相關程度,並以圖表呈現。
由直方圖「全票」、「mvp」、「勇士」、「咖哩」、「輸」、「丟人」可看出,網友對於總冠軍的討論,仍圍繞在去年的冠軍賽上(勇士以破紀錄例行賽73勝9負、Curry拿下史上第一個全票通過MVP的成績,不幸在冠軍賽落敗)。
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ass = findAssocs(tdm, "冠軍", 0.45)
g <- unlist(ass)
g <- cbind(names(g), g)
g <- as.data.frame(g)
colnames(g) <- c("關聯詞", "關聯度")
#par(family=("Heiti TC Light"))
G <- ggplot(g, aes(x = 關聯詞, y = 關聯度)) + geom_bar(stat = 'identity') + labs(x = '關聯詞',y = '關聯度')
G <- G + theme(axis.title = element_text(family = "Heiti TC Light"),
axis.text.x = element_text(angle = 60, family = "Heiti TC Light", hjust = 1))
print (G)
##建立TF-IDF矩陣
library(rJava)
library(SnowballC)
library(slam)
library(Matrix)
N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{
log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
for(y in 1:ncol(tdm))
{
doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
}
}
由圖可看出,Curry與其所屬隊伍勇士隊往往會伴隨著有關討論有關總冠軍話題的文章出現。
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
topID = lapply(rownames(as.data.frame(ass)), function(x)
which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
x = as.numeric(colnames(doc.tfidf)),
y = doc.tfidf[topID[1],],
name = rownames(doc.tfidf)[topID[1]],
type = "scatter", mode = "box") %>%
add_trace(y = doc.tfidf[topID[3],],
name = rownames(doc.tfidf)[topID[3]]) %>%
add_trace(y = doc.tfidf[topID[6],],
name = rownames(doc.tfidf)[topID[6]])
##透過 Elbow Method 找出最佳集群數目(optimal number of clusters)
mydata <- t(doc.tfidf)
mydata <- mydata[,apply(mydata, 2, var, na.rm=TRUE) != 0]
wss <- (nrow(mydata)-1) * sum(apply(mydata, 2 ,var))
for (i in 2:15)
wss[i] <- sum(kmeans(mydata,centers=i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
ylab = "Within groups sum of squares",
main = "Assessing the Optimal Number of Clusters with the Elbow Method",
pch = 20, cex = 2)
利用上面所求得的最佳群集數目(這裡暫時訂為14),將文章作K-Means Clustering
num_clusters = 14
kmeansOut = kmeans(mydata, num_clusters, nstart = 20, algorithm = c("Lloyd"), iter.max = 30)
mydata.pca = prcomp(mydata)
mydata.kmeans = as.factor(kmeansOut$cluster)
kmeans_clustering <- as.data.frame(mydata.kmeans)
kmeans_clustering
## mydata.kmeans
## 1 3
## 2 13
## 3 3
## 4 3
## 5 3
## 6 3
## 7 3
## 8 3
## 9 3
## 10 3
## 11 3
## 12 3
## 13 3
## 14 3
## 15 3
## 16 3
## 17 3
## 18 3
## 19 3
## 20 3
## 21 5
## 22 3
## 23 3
## 24 3
## 25 3
## 26 3
## 27 3
## 28 3
## 29 3
## 30 3
## 31 3
## 32 3
## 33 3
## 34 3
## 35 7
## 36 3
## 37 3
## 38 3
## 39 3
## 40 3
## 41 3
## 42 3
## 43 3
## 44 3
## 45 3
## 46 3
## 47 3
## 48 3
## 49 3
## 50 3
## 51 2
## 52 3
## 53 3
## 54 3
## 55 3
## 56 3
## 57 3
## 58 3
## 59 3
## 60 3
## 61 3
## 62 3
## 63 3
## 64 3
## 65 3
## 66 3
## 67 3
## 68 3
## 69 12
## 70 3
## 71 3
## 72 3
## 73 3
## 74 3
## 75 3
## 76 3
## 77 13
## 78 10
## 79 3
## 80 3
## 81 5
## 82 3
## 83 3
## 84 7
## 85 3
## 86 5
## 87 3
## 88 3
## 89 3
## 90 3
## 91 3
## 92 3
## 93 3
## 94 3
## 95 3
## 96 5
## 97 5
## 98 3
## 99 3
## 100 3
## 101 3
## 102 3
## 103 3
## 104 3
## 105 3
## 106 3
## 107 3
## 108 3
## 109 3
## 110 3
## 111 3
## 112 3
## 113 3
## 114 3
## 115 3
## 116 3
## 117 3
## 118 3
## 119 3
## 120 7
## 121 3
## 122 3
## 123 13
## 124 3
## 125 3
## 126 3
## 127 3
## 128 3
## 129 3
## 130 3
## 131 3
## 132 3
## 133 3
## 134 14
## 135 3
## 136 3
## 137 3
## 138 3
## 139 3
## 140 3
## 141 3
## 142 3
## 143 3
## 144 3
## 145 3
## 146 3
## 147 3
## 148 3
## 149 3
## 150 3
## 151 3
## 152 3
## 153 3
## 154 3
## 155 3
## 156 3
## 157 3
## 158 3
## 159 3
## 160 3
## 161 3
## 162 3
## 163 3
## 164 3
## 165 3
## 166 3
## 167 13
## 168 3
## 169 3
## 170 3
## 171 8
## 172 3
## 173 3
## 174 12
## 175 9
## 176 3
## 177 3
## 178 3
## 179 3
## 180 3
## 181 3
## 182 3
## 183 8
## 184 3
## 185 3
## 186 3
## 187 5
## 188 3
## 189 3
## 190 3
## 191 5
## 192 12
## 193 3
## 194 3
## 195 3
## 196 3
## 197 3
## 198 3
## 199 3
## 200 3
## 201 4
## 202 3
## 203 3
## 204 3
## 205 6
## 206 7
## 207 3
## 208 3
## 209 3
## 210 3
## 211 3
## 212 3
## 213 3
## 214 3
## 215 3
## 216 5
## 217 3
## 218 7
## 219 3
## 220 3
## 221 1
## 222 3
## 223 3
## 224 3
## 225 3
## 226 3
## 227 3
## 228 3
## 229 3
## 230 3
## 231 3
## 232 3
## 233 9
## 234 7
## 235 3
## 236 3
## 237 3
## 238 3
## 239 3
## 240 3
## 241 3
## 242 3
## 243 3
## 244 3
## 245 3
## 246 3
## 247 3
## 248 3
## 249 3
## 250 3
## 251 3
## 252 3
## 253 3
## 254 3
## 255 3
## 256 3
## 257 3
## 258 3
## 259 3
## 260 3
## 261 3
## 262 3
## 263 3
## 264 13
## 265 3
## 266 3
## 267 3
## 268 3
## 269 3
## 270 3
## 271 3
## 272 11
## 273 3
## 274 3
## 275 1
## 276 3
## 277 3
## 278 3
## 279 3
## 280 3
## 281 3
## 282 3
## 283 3
## 284 3
## 285 3
## 286 3
## 287 3
## 288 3
## 289 3
## 290 3
## 291 3
## 292 3
## 293 3
## 294 3
## 295 3
## 296 3
## 297 3
## 298 3
## 299 3
## 300 3
## 301 3
## 302 3
## 303 3
## 304 3
## 305 3
## 306 3
## 307 3
## 308 3
## 309 9
## 310 3
## 311 3
## 312 3
## 313 3
## 314 3
## 315 3
## 316 3
## 317 3
## 318 3
## 319 3
## 320 3
由圖可看出,文章並沒有所謂明顯的分群,推測是由於NBA版為PTT上的某一分類看板,討論的主題已被歸類,因此較難再加以分群。 且由圖可明顯看出一個問題:因為在清理文本時無法有效將非相關的字詞清掉導致資料不夠乾淨(一些網址、貼文者及留言者ID等都會被當成文本處理),因此其準確性仍稍嫌不足。
library(ggbiplot)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## Loading required package: scales
## Loading required package: grid
library(scales)
library(grid)
g1 <- ggbiplot(mydata.pca, obs.scale = 1, var.scale = 1,
groups = mydata.kmeans, ellipse = TRUE,
circle = TRUE, labels = rownames(mydata))
g1 <- g1 + scale_color_discrete(name = '')
g1 <- g1 + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g1)