承接hw3臉書主題,分析連續不同天文本的關係
#import data
source("hw3txt.R")
## Warning: package 'jiebaRD' was built under R version 3.3.3
## Warning: package 'jiebaR' was built under R version 3.3.3
## Warning: package 'wordcloud' was built under R version 3.3.3
#corpus to tdm
d.corpus <- Corpus(VectorSource(seg))
由於作業3文字雲主題,可以看出高雄市長發文裡面,“高雄”的字詞在各篇文章中,占有很高的比例,因此利用相關係數,尋找相關性0.7以上與高雄相關的字,利用位置1的“優質”與位置53的“商機”,來看看各篇文本相關的關係
#沒有做過數學權重的tdm
##到這裡就整理出將鋸子拆開來的矩陣檔
tdm <- TermDocumentMatrix(d.corpus,
control = list(wordLengths = c(1, Inf)))
#View(inspect(tdm[1:9, 1:11]))
ass = findAssocs(tdm, "高雄", 0.70)
ass
## $高雄
## 優質 取得 已 項 產\xab 廠商
## 0.89 0.83 0.79 0.79 0.77 0.77
## 食\xab 農漁 十分重視 大飽口福 工\xb7 及民眾
## 0.76 0.75 0.72 0.72 0.72 0.72
## 戶 戶及 戶到 方面 水產\xab 乎
## 0.72 0.72 0.72 0.72 0.72 0.72
## 市場 民眾 生產 用\xab 向來 安全
## 0.72 0.72 0.72 0.72 0.72 0.72
## 成長 局 把關 決心 迄今已 呷
## 0.72 0.72 0.72 0.72 0.72 0.72
## 定 物產 者 表示 型 政策
## 0.72 0.72 0.72 0.72 0.72 0.72
## 首度 倍並 原來 展今 展及 展以
## 0.72 0.72 0.72 0.72 0.72 0.72
## 展高雄 展售 展覽 展覽\xc0 海洋局 烘焙
## 0.72 0.72 0.72 0.72 0.72 0.72
## 特別 參與 參\xc6 商\xab 商機 國產
## 0.72 0.72 0.72 0.72 0.72 0.72
## 國際飯店 將近 清真 產銷 盛大 規模
## 0.72 0.72 0.72 0.72 0.72 0.72
## 責任 章 備 提高 無限 進入
## 0.72 0.72 0.72 0.72 0.72 0.72
## 進軍 間 項目 飲 僅 搭乘
## 0.72 0.72 0.72 0.72 0.72 0.72
## 誠摯 農及 農委\xb7 農畜產\xab 農\xb7 幕典禮
## 0.72 0.72 0.72 0.72 0.72 0.72
## \xba 滿載而歸 精選 \xbb 輕軌 履歷
## 0.72 0.72 0.72 0.72 0.72 0.72
## 穆斯林 擴增 豐富 證 攤位 歡迎
## 0.72 0.72 0.72 0.72 0.72 0.72
## 驗證 輔導
## 0.72 0.71
畫出 tf-idf 統計圖
# tf-idf computation
N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{
log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
for(y in 1:ncol(tdm))
{
doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
}
}
# 畫出 tf-idf 統計圖
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
topID = lapply(rownames(as.data.frame(ass)), function(x)
which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
x = as.numeric(colnames(doc.tfidf)),
y = doc.tfidf[topID[1],],
name = rownames(doc.tfidf)[topID[1]],
type = "scatter", mode= "box") %>%
add_trace(y = doc.tfidf[topID[53],],
name = rownames(doc.tfidf)[topID[53]])
## Warning: package 'bindrcpp' was built under R version 3.3.3
於是發現,第10篇文章,是“優質”與“商機”最相關的文章;而第6篇與第10篇文本都是與優質相關的文章
# get short doc matrix
nonzero = (doc.tfidf != rep(0,11))
nonzeroid = which(row_sums(nonzero) != 0)
q <- rownames(doc.tfidf[nonzeroid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]
View(s.tdm)
# result : cos similarity ranking
cos.sim <- function(x, y)
{
(as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y))
}
doc.cos <- apply(s.tdm[,1:11], 2, cos.sim,
y=as.matrix(s.tdm[,11]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
x = rownames(as.data.frame(orderDoc)),
y = orderDoc,
name = rownames(doc.tfidf)[topID[1]],
type = "bar", mode= "box")
## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'
藉由圖發現第11篇與第8篇文本的相關性可能是最高的,而回顧文本內容,也可以看出這兩篇主要內容是與國際接軌有關係的文章
str(doc.tfidf)
## num [1:928, 1:11] 0 0.00154 0.03887 0.02106 0.03887 ...
## - attr(*, "dimnames")=List of 2
## ..$ Terms: chr [1:928] "c" "\xa4" "又" "大家" ...
## ..$ Docs : chr [1:11] "1" "2" "3" "4" ...
summary(doc.tfidf)
## 1 2 3
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.002721 Mean :0.002604 Mean :0.002514
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.110536 Max. :0.075865 Max. :0.060692
## 4 5 6
## Min. :0.00000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.00000 Median :0.000000 Median :0.000000
## Mean :0.00219 Mean :0.002772 Mean :0.002396
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.20350 Max. :0.108107 Max. :0.091519
## 7 8 9
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.002604 Mean :0.002788 Mean :0.002705
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.135664 Max. :0.116774 Max. :0.092498
## 10 11
## Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000
## Mean :0.002463 Mean :0.002578
## 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :0.070601 Max. :0.104831
set.seed(11)#因為有11篇文章
kmeansOut <- kmeans(doc.tfidf, 2, nstart = 50)
plot(doc.tfidf, col =(kmeansOut$cluster +1) , main="article analysis", pch=18, cex=2)
mydata <- doc.tfidf
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 1:20) wss[i] <- sum(kmeans(mydata,
centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares",
main="Assessing the Optimal Number of Clusters with the Elbow Method",
pch=20, cex=2)
set.seed(11)
km2 = kmeans(mydata, 13, nstart=50)
# Examine the result of the clustering algorithm
plot(mydata, col =c(1:13), main="K-Means result with 13 clusters",pch = 20 ,cex=2)
legend('topright',c('國際、比賽','整治、改造','漁業','產學、地方發展','處理','藝術','經濟、優質','高雄綜合發展','土地','服務、幸福','表演','宗教信仰','醫療服務'),col = c(1:13),pch = 20,bty='n', cex=.75)
1.一開始先分成兩群畫圖看看,發現組間/組內的變異很小
2.藉由找到最佳分群的方法,分成13群,可以看出組間/組內的變異變大,因此分群現象清楚 但由於kmeans抓出來的字詞是從各篇文章抓出相近的一些字詞,發現出現的頻率相近,故根據分群的結果,做判斷後,給定比較大的分類項目名稱,分類過程中,明顯發現分群到第四與第六與第九群的字詞非常的少,發現整治、改造與產學、及地方發產有比較高的關係
#install.packages("devtools")
library(devtools)
#install_github("ggbiplot","vqv")
library(scales)
library(grid)
library(ggbiplot)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
testTfidf = doc.tfidf
tfidf.pca <- prcomp(testTfidf)
#讓文字顯示成中文
biplot(tfidf.pca)
發現經過降維之後,抓下來的11篇文章中以,第一篇、第四篇、第五篇分群現象最明顯