
#import data
## Warning: package 'jiebaRD' was built under R version 3.3.3
## Warning: package 'jiebaR' was built under R version 3.3.3
## Warning: package 'wordcloud' was built under R version 3.3.3
# Corpus to term-document matrix.
# Wrap the segmented text `seg` in a tm corpus.
d.corpus <- Corpus(
  VectorSource(seg)
)

# Term-document matrix built from the corpus.
# wordLengths = c(1, Inf) sets the accepted term length range to 1..Inf,
# so single-character terms are kept.
tdm <- TermDocumentMatrix(
  d.corpus,
  control = list(
    wordLengths = c(1, Inf)
  )
)
# View(inspect(tdm[1:9, 1:11]))

# Terms associated with "高雄", using 0.70 as the lower correlation limit.
ass <- findAssocs(tdm, "高雄", 0.70)
畫出 tf-idf 統計圖

# tf-idf computation
# N: number of documents (columns of the term-document matrix).
N <- tdm$ncol

# Dense term x document count matrix; overwritten with tf-idf weights below.
doc.tfidf <- as.matrix(tdm)

# tf: total term count of each document (column sums), used to normalise the
# raw counts of that document.
tf <- colSums(doc.tfidf)

# Inverse document frequency of a single term.
#   word_doc: that term's counts across all documents (one matrix row).
#   Returns log2(N / number of documents with a non-zero count); reads the
#   global N defined above.
idfCal <- function(word_doc) {
  log2(N / nnzero(word_doc))
}
idf <- apply(doc.tfidf, 1, idfCal)

# tf-idf[x, y] = (count[x, y] / tf[y]) * idf[x].
# sweep() divides every column by its document total; multiplying by `idf`
# (one value per row) recycles down the columns, so row x is scaled by idf[x].
# This is the same arithmetic as a cell-by-cell double loop, done in one
# vectorised step, and it is also safe when the matrix has zero rows/columns.
doc.tfidf <- sweep(doc.tfidf, 2, tf, "/") * idf

# Plot tf-idf values across documents for terms listed in `ass`.
# topID collects, for every term named in `ass`, its row index in `tdm`.
topID <- unlist(lapply(
  rownames(as.data.frame(ass)),
  function(term) which(rownames(tdm) == term)
))

# Two traces: x is the document id (column names parsed as numbers), y is the
# tf-idf value of the selected term in each document.
# NOTE(review): "box" is not one of plotly's documented scatter modes
# (lines / markers / text) -- confirm the intended mode.
# NOTE(review): topID[53] assumes `ass` holds at least 53 terms -- confirm.
add_trace(
  plot_ly(
    data = as.data.frame(doc.tfidf),
    x = as.numeric(colnames(doc.tfidf)),
    y = doc.tfidf[topID[1], ],
    name = rownames(doc.tfidf)[topID[1]],
    type = "scatter",
    mode = "box"
  ),
  y = doc.tfidf[topID[53], ],
  name = rownames(doc.tfidf)[topID[53]]
)
再來是看文章之間的相關性,利用 cosine similarity 來看

# get short doc matrix
# Keep only the terms (rows) whose tf-idf weight is non-zero in at least one
# document.
# Compare against the scalar 0 instead of rep(0, 11): the result is the same,
# but it no longer depends on the corpus having exactly 11 documents.
nonzero <- (doc.tfidf != 0)
nonzeroid <- which(row_sums(nonzero) != 0)
# Index the row-name vector directly; subsetting the matrix first would drop
# to a plain vector (and lose its row names) when only one row is selected.
q <- rownames(doc.tfidf)[nonzeroid]
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
# drop = FALSE keeps s.tdm a matrix even if a single term survives.
s.tdm <- doc.tfidf[loc, , drop = FALSE]

# result : cos similarity ranking

# Cosine similarity between two numeric vectors.
#   x, y: numeric vectors (or single-column matrices) of equal length.
#   Returns a 1 x 1 matrix holding (x . y) / (||x|| * ||y||), where ||.|| is
#   the Euclidean norm. The value is NaN when either input is all zeros.
cos.sim <- function(x, y) {
  x <- as.vector(x)
  y <- as.vector(y)
  # Use explicit Euclidean norms: base norm() defaults to the one-norm
  # (type = "O") and refuses plain vectors, which made the previous result
  # not a cosine similarity.
  (x %*% y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}

# Apply cos.sim to each of the first 11 columns of s.tdm.
# NOTE(review): this call is truncated -- the closing parenthesis and the
# extra argument after `cos.sim,` are missing. Presumably it was a `y = ...`
# reference column forwarded to cos.sim; restore it before running.
doc.cos <- apply(s.tdm[,1:11], 2, cos.sim,
# Rank the similarity scores from highest to lowest; missing scores, if any,
# stay at the end.
orderDoc <- sort(doc.cos, decreasing = TRUE, na.last = TRUE)

# Bar chart of the ranked scores, labelled with the row names of the ranked
# scores once converted to a data frame.
plot_ly(
  data = as.data.frame(orderDoc),
  x = rownames(as.data.frame(orderDoc)),
  y = orderDoc,
  name = rownames(doc.tfidf)[topID[1]],
  type = "bar",
  mode = "box"
)
##  num [1:928, 1:11] 0 0.00154 0.03887 0.02106 0.03887 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Terms: chr [1:928] "c" "\xa4" "又" "大家" ...
##   ..$ Docs : chr [1:11] "1" "2" "3" "4" ...
##        1                  2                  3           
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.002721   Mean   :0.002604   Mean   :0.002514  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.110536   Max.   :0.075865   Max.   :0.060692  
##        4                 5                  6           
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.000000   Median :0.000000  
##  Mean   :0.00219   Mean   :0.002772   Mean   :0.002396  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.20350   Max.   :0.108107   Max.   :0.091519  
##        7                  8                  9           
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.002604   Mean   :0.002788   Mean   :0.002705  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.135664   Max.   :0.116774   Max.   :0.092498  
##        10                 11          
##  Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000  
##  Mean   :0.002463   Mean   :0.002578  
##  3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.070601   Max.   :0.104831
# k-means on the rows of doc.tfidf with 2 centers and nstart = 50.
kmeansOut <- kmeans(doc.tfidf, centers = 2, nstart = 50)

# Plot the matrix with points coloured by cluster; adding 1 shifts the
# cluster ids 1/2 to colour indices 2/3.
plot(
  doc.tfidf,
  col = kmeansOut$cluster + 1,
  main = "article analysis",
  pch = 18,
  cex = 2
)

# Elbow method: within-group sum of squares for k = 1..20 clusters.
mydata <- doc.tfidf

# Starting value for wss[1]: (n - 1) * sum of the column variances.
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))

# For each candidate k, run k-means and store the summed within-cluster sum
# of squares. The original statement was cut off after `kmeans(mydata,`;
# `centers = i` and `$withinss` are the completion implied by the loop index
# and the "Within groups sum of squares" axis label below.
for (i in 1:20) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}

plot(1:20, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares",
     main = "Assessing the Optimal Number of Clusters with the Elbow Method",
     pch = 20, cex = 2)

# k-means on the rows of mydata with 13 centers and nstart = 50.
km2 <- kmeans(mydata, 13, nstart = 50)

# Examine the result of the clustering algorithm

# Colour every point by the cluster it was assigned to. Passing c(1:13) here
# would only recycle the colours by row position, so the colours would not
# correspond to the clusters named in the legend.
# NOTE(review): the legend uses colour indices 1:13; if the active palette
# has fewer than 13 colours some clusters will share a colour -- confirm.
plot(mydata, col = km2$cluster, main = "K-Means result with 13 clusters",
     pch = 20, cex = 2)
legend('topright',c('國際、比賽','整治、改造','漁業','產學、地方發展','處理','藝術','經濟、優質','高雄綜合發展','土地','服務、幸福','表演','宗教信仰','醫療服務'),col = c(1:13),pch = 20,bty='n', cex=.75)



2. 藉由找到最佳分群數的方法,分成 13 群,可以看出組間/組內的變異變大,因此分群現象清楚。但由於 kmeans 抓出來的字詞是從各篇文章抓出的一些相近字詞,且發現其出現的頻率相近,故根據分群的結果判斷後,給定比較大的分類項目名稱。分類過程中,明顯發現分到第四、第六與第九群的字詞非常少,並發現「整治、改造」與「產學、地方發展」有比較高的關係。


# Principal component analysis of the tf-idf matrix with prcomp's default
# settings; the input is kept under its own name, testTfidf.
testTfidf <- doc.tfidf
tfidf.pca <- prcomp(testTfidf)

