hw4 - 用tfidf分類專利文件

library(NLP)
Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk1.8.0_144/")
library(rJava)
library(slam)
library(Matrix)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

source('readfromtxt.R')

## Loading required package: xml2

## # tmcn Version: 0.2-8

#用之前爬蟲到的專利文件之claim做分類依據

tdm <- TermDocumentMatrix(docs,control = list(wordLengths = c(2, Inf)))
ass1 = findAssocs(tdm, "method", 0.5)
print(ass1)

## $method
## templat perform     may 
##    0.53    0.52    0.51

ass2 = findAssocs(tdm, "invent", 0.5)
print(ass2)

## $invent
##   present invention     three   subject   summari   without     cells 
##      0.70      0.70      0.59      0.58      0.55      0.53      0.53 
##      test     prior     brief     adult     solid  proteins   infiltr 
##      0.53      0.52      0.51      0.51      0.51      0.51      0.50

#可看出申請方法專利之間的相關性較發明專利少
# tf-idf computation
N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{ 
  log2( N / nnzero(word_doc) ) 
}
idf <- apply(tdm, 1, idfCal)


doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}

# 畫出 tf-idf 統計圖(用剛剛的ass2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

topID = lapply(rownames(as.data.frame(ass2)), function(x) 
  which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[3],], 
        name = rownames(doc.tfidf)[topID[1]],
        type = "scatter", mode= "box") %>%
  add_trace(y = doc.tfidf[topID[2],],
            name = rownames(doc.tfidf)[topID[4]])

## Warning: Ignoring 5 observations

## Warning: Ignoring 5 observations

#get short doc matrix
q <- rownames(doc.tfidf[c(1:nrow(doc.tfidf)),])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]

# result : cos similarity ranking
cos.sim <- function(x, y)
{ 
  (as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y)) 
}
doc.cos <- apply(s.tdm[,1:100], 2, cos.sim,
                 y=as.matrix(s.tdm[,43]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
        x = rownames(as.data.frame(orderDoc)),
        y = orderDoc, 
        name = rownames(doc.tfidf)[topID],
        type = "bar", mode= "box")

## Warning: Ignoring 5 observations

## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'

#第43號專利與其他專利的同質性較高，稍微看了一下文件，可以發現都是工程物理類的文件。

# Kmeans 分群

library(stats)
#因為IPC(依技術分類的一種標準)共有8大類，故以8類為分群標準
testTfidf = doc.tfidf
testTfidf= testTfidf[,-c(35,48,66,67,86)]
testTfidf =testTfidf[-nrow(testTfidf),]
testTfidf = t(testTfidf)
set.seed(55)
kmeansOut <- kmeans(testTfidf, 8, nstart = 50)
tfidf.kmeans =as.factor(kmeansOut$cluster)
kmeans_result = as.data.frame(tfidf.kmeans)
print(kmeans_result)#結果與剛剛做的cos similarity 不太相合

##     tfidf.kmeans
## 1              1
## 2              1
## 3              8
## 4              1
## 5              1
## 6              5
## 7              1
## 8              1
## 9              1
## 10             2
## 11             1
## 12             1
## 13             1
## 14             1
## 15             1
## 16             1
## 17             1
## 18             1
## 19             1
## 20             7
## 21             1
## 22             1
## 23             1
## 24             4
## 25             1
## 26             1
## 27             1
## 28             1
## 29             1
## 30             1
## 31             1
## 32             3
## 33             1
## 34             1
## 36             1
## 37             1
## 38             1
## 39             5
## 40             1
## 41             1
## 42             1
## 43             1
## 44             1
## 45             1
## 46             1
## 47             1
## 49             1
## 50             1
## 51             1
## 52             1
## 53             1
## 54             1
## 55             1
## 56             1
## 57             1
## 58             1
## 59             5
## 60             5
## 61             1
## 62             1
## 63             1
## 64             1
## 65             1
## 68             1
## 69             1
## 70             1
## 71             1
## 72             1
## 73             1
## 74             1
## 75             1
## 76             5
## 77             1
## 78             1
## 79             1
## 80             8
## 81             1
## 82             1
## 83             1
## 84             1
## 85             1
## 87             1
## 88             4
## 89             1
## 90             1
## 91             1
## 92             7
## 93             1
## 94             5
## 95             1
## 96             1
## 97             1
## 98             1
## 99             1
## 100            1
## 101            6

#繪圖觀察
tfidf.pca <- prcomp(testTfidf)
library(ggbiplot)

## Loading required package: plyr

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## Loading required package: scales

## Loading required package: grid

g <- ggbiplot(tfidf.pca, obs.scale = 1, var.scale = 1, 
              groups = tfidf.kmeans, ellipse = TRUE, 
              circle = TRUE, labels = rownames(testTfidf))
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal', 
               legend.position = 'top')
#似乎只能看出明顯兩大區
print(g)

hw4 - 用tfidf分類專利文件

翁嬿婷 生機三 B04608042

翁嬿婷生機三 B04608042