# Search the USPTO database for patents whose assignee is National Taiwan University
# (搜尋美國專利局中所有權人為台灣大學之專利案件)


# Scrape the text of each patent linked from a USPTO search-result page and
# merge everything into a single string (`page1`) for text mining.
library(xml2)

# NOTE(review): the search-result URL was missing from the original script
# (`url =` had no right-hand side). Paste the USPTO query URL here.
url <- ""
if (!nzchar(url)) {
  stop("Set `url` to the USPTO search-result page before running.",
       call. = FALSE)
}
wordpage <- read_html(url)

# Anchors in the second cell of each table row; presumably these are the
# links to the individual patent pages -- TODO confirm against the page.
xpath <- "//table//tr/td[2]/a"
half_url <- xml_attr(xml_find_all(wordpage, xpath), "href")
half_url <- half_url[!is.na(half_url)]
if (length(half_url) == 0L) {
  stop("No patent links matched `", xpath, "` on the search page.",
       call. = FALSE)
}

# The hrefs are assumed to be site-relative, so prefix the host.
url_content <- paste0("http://patft.uspto.gov", half_url)

# Read at most this many patents; never index past the links actually found.
max_patents <- 50L

# First <p> of any element under <body>; assumed to hold the abstract --
# TODO confirm against a patent page.
target <- "//body//p[1]"

abstracts <- vapply(
  head(url_content, max_patents),
  function(patent_url) {
    patent <- read_html(patent_url)
    # Collapse every match into one string so each page contributes exactly
    # one entry and no text is duplicated by vector recycling.
    paste(xml_text(xml_find_all(patent, target)), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# One combined document; word counts do not depend on concatenation order.
page1 <- paste(abstracts, collapse = " ")

# Build a tm corpus from the scraped text (`page1`) and normalise it so that
# term counts reflect technical vocabulary rather than boilerplate.
library(tm)

docs <- Corpus(VectorSource(page1))
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Legal documents follow fixed writing patterns, so drop a large set of
# uninformative words. Entries must be bare words (no surrounding spaces),
# otherwise the word-boundary match used by removeWords can miss them.
boilerplate_words <- c(
  "generate", "object", "association", "connected", "period", "includes",
  "shell", "provided", "first", "second", "present", "end", "group",
  "formed", "element", "comprising", "can", "the"
)
docs <- tm_map(docs, removeWords, boilerplate_words)
# Eliminate extra white space last, so gaps left by removed words go too
docs <- tm_map(docs, stripWhitespace)

# Count term frequencies across the cleaned corpus (`docs`), show the ten most
# frequent terms, and draw a word cloud of the top terms.
library(wordcloud)
library(RColorBrewer)

dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Total occurrences of each term over all documents, most frequent first
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
print(head(d, 10))
# Sample output from an earlier run:
##                word freq
## layer         layer   52
## metal         metal   34
## method       method   33
## electrode electrode   31
## invention invention   24
## gate           gate   21
## region       region   21
## structure structure   20
## substrate substrate   19
## catalyst   catalyst   18

# wordcloud() is called for its plotting side effect; wrapping it in print()
# only adds a stray "NULL" to the console output.
wordcloud(
  words = d$word, freq = d$freq, min.freq = 3,
  max.words = 20, random.order = FALSE, rot.per = 0.38,
  colors = brewer.pal(7, "Dark2")
)


