library(NLP)
Sys.setenv(JAVA_HOME="C:/Program Files/Java/jdk1.8.0_144/")
library(rJava)
library(slam)
library(Matrix)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
source('readfromtxt.R')
## Loading required package: xml2
## # tmcn Version: 0.2-8
#用之前爬蟲到的專利文件之claim做分類依據
tdm <- TermDocumentMatrix(docs,control = list(wordLengths = c(2, Inf)))
ass1 = findAssocs(tdm, "method", 0.5)
print(ass1)
## $method
## templat perform may
## 0.53 0.52 0.51
ass2 = findAssocs(tdm, "invent", 0.5)
print(ass2)
## $invent
## present invention three subject summari without cells
## 0.70 0.70 0.59 0.58 0.55 0.53 0.53
## test prior brief adult solid proteins infiltr
## 0.53 0.52 0.51 0.51 0.51 0.51 0.50
#可看出申請方法專利之間的相關性較發明專利少
# tf-idf computation
N = tdm$ncol
tf <- apply(tdm, 2, sum)
idfCal <- function(word_doc)
{
log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
for(y in 1:ncol(tdm))
{
doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
}
}
# 畫出 tf-idf 統計圖(用剛剛的ass2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
topID = lapply(rownames(as.data.frame(ass2)), function(x)
which(rownames(tdm) == x))
topID = unlist(topID)
plot_ly(data = as.data.frame(doc.tfidf),
x = as.numeric(colnames(doc.tfidf)),
y = doc.tfidf[topID[3],],
name = rownames(doc.tfidf)[topID[1]],
type = "scatter", mode= "box") %>%
add_trace(y = doc.tfidf[topID[2],],
name = rownames(doc.tfidf)[topID[4]])
## Warning: Ignoring 5 observations
## Warning: Ignoring 5 observations
#get short doc matrix
q <- rownames(doc.tfidf[c(1:nrow(doc.tfidf)),])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]
# result : cos similarity ranking
cos.sim <- function(x, y)
{
(as.vector(x) %*% as.vector(y)) / (norm(as.matrix(x)) * norm(y))
}
doc.cos <- apply(s.tdm[,1:100], 2, cos.sim,
y=as.matrix(s.tdm[,43]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
plot_ly(data = as.data.frame(orderDoc),
x = rownames(as.data.frame(orderDoc)),
y = orderDoc,
name = rownames(doc.tfidf)[topID],
type = "bar", mode= "box")
## Warning: Ignoring 5 observations
## Warning: 'bar' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'
#第43號專利與其他專利的同質性較高,稍微看了一下文件,可以發現都是工程物理類的文件。
# Kmeans 分群
library(stats)
#因為IPC(依技術分類的一種標準)共有8大類,故以8類為分群標準
testTfidf = doc.tfidf
testTfidf= testTfidf[,-c(35,48,66,67,86)]
testTfidf =testTfidf[-nrow(testTfidf),]
testTfidf = t(testTfidf)
set.seed(55)
kmeansOut <- kmeans(testTfidf, 8, nstart = 50)
tfidf.kmeans =as.factor(kmeansOut$cluster)
kmeans_result = as.data.frame(tfidf.kmeans)
print(kmeans_result)#結果與剛剛做的cos similarity 不太相合
## tfidf.kmeans
## 1 1
## 2 1
## 3 8
## 4 1
## 5 1
## 6 5
## 7 1
## 8 1
## 9 1
## 10 2
## 11 1
## 12 1
## 13 1
## 14 1
## 15 1
## 16 1
## 17 1
## 18 1
## 19 1
## 20 7
## 21 1
## 22 1
## 23 1
## 24 4
## 25 1
## 26 1
## 27 1
## 28 1
## 29 1
## 30 1
## 31 1
## 32 3
## 33 1
## 34 1
## 36 1
## 37 1
## 38 1
## 39 5
## 40 1
## 41 1
## 42 1
## 43 1
## 44 1
## 45 1
## 46 1
## 47 1
## 49 1
## 50 1
## 51 1
## 52 1
## 53 1
## 54 1
## 55 1
## 56 1
## 57 1
## 58 1
## 59 5
## 60 5
## 61 1
## 62 1
## 63 1
## 64 1
## 65 1
## 68 1
## 69 1
## 70 1
## 71 1
## 72 1
## 73 1
## 74 1
## 75 1
## 76 5
## 77 1
## 78 1
## 79 1
## 80 8
## 81 1
## 82 1
## 83 1
## 84 1
## 85 1
## 87 1
## 88 4
## 89 1
## 90 1
## 91 1
## 92 7
## 93 1
## 94 5
## 95 1
## 96 1
## 97 1
## 98 1
## 99 1
## 100 1
## 101 6
#繪圖觀察
tfidf.pca <- prcomp(testTfidf)
library(ggbiplot)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(tfidf.pca, obs.scale = 1, var.scale = 1,
groups = tfidf.kmeans, ellipse = TRUE,
circle = TRUE, labels = rownames(testTfidf))
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
#似乎只能看出明顯兩大區
print(g)