PTT Boy-Girl 分析
作者SpringLin (春春)
看板Boy-Girl
標題[求助] 真的是我價值觀問題嗎?
時間Mon Mar 26 03:10:20 2018
發信站: 批踢踢實業坊(ptt.cc), 來自: 27.52.197.190
文章網址: https://www.ptt.cc/bbs/Boy-Girl/M.1522005022.A.21D.html
作者Nianyi (かっばちゃん)
看板Boy-Girl
標題[心情] 被抱有好感的男性友人說978
時間Thu Mar 29 03:09:41 2018
發信站: 批踢踢實業坊(ptt.cc), 來自: 118.167.38.54
文章網址: https://www.ptt.cc/bbs/Boy-Girl/M.1522264183.A.BA2.html
作者skyline5557 (廢青)
看板Boy-Girl
標題[心情] 再給我們兩年的時間
時間Thu Mar 29 00:06:24 2018
發信站: 批踢踢實業坊(ptt.cc), 來自: 101.12.165.71
文章網址: https://www.ptt.cc/bbs/Boy-Girl/M.1522253186.A.1BD.html
- 載入所需的套件包
library(bitops)
library(httr)
library(RCurl)
library(XML)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
library(NLP)
library(tmcn)
## # tmcn Version: 0.2-8
library(jiebaRD)
library(jiebaR)
- PTT 網路爬蟲抓出所有文章內文所對應的網址
from <- 3860 # 2018-03-25
to <- 3874 # 2018-03-31
prefix = "https://www.ptt.cc/bbs/Boy-Girl/index"
data <- list()
for( id in c(from:to) )
{
url <- paste0( prefix, as.character(id), ".html" )
html <- htmlParse( GET(url) )
url.list <- xpathSApply( html, "//div[@class='title']/a[@href]", xmlAttrs )
data <- rbind( data, as.matrix(paste('https://www.ptt.cc', url.list, sep='')) )
}
data <- unlist(data)
head(data)
- 利用所有文章的網址去抓所有文章內文, 並解析出文章的內容並依照 hour 合併儲存。
library(dplyr)
getdoc <- function(url)
{
html <- htmlParse( getURL(url) )
doc <- xpathSApply( html, "//div[@id='main-content']", xmlValue )
time <- xpathSApply( html, "//*[@id='main-content']/div[4]/span[2]", xmlValue )
temp <- gsub( " ", " 0", unlist(time) )
part <- strsplit( temp, split=" ", fixed=T )
#date <- paste(part[[1]][2], part[[1]][3], part[[1]][5], sep="-")
#date <- paste(part[[1]][2], part[[1]][5], sep="_")
#date <- paste(part[[1]][1], part[[1]][2], sep="_")
timestamp <- part[[1]][4]
timestamp <- strsplit( timestamp, split=":", fixed=T )
hour <- timestamp[[1]][1]
#print(hour)
name <- paste0('./DATA/', hour, ".txt")
write(doc, name, append = TRUE)
}
sapply(data, getdoc)
- 建立文本資料結構與基本文字清洗
d.corpus <- Corpus( DirSource("./DATA") )
d.corpus <- tm_map(d.corpus, removePunctuation)
d.corpus <- tm_map(d.corpus, removeNumbers)
d.corpus <- tm_map(d.corpus, function(word) {
gsub("[A-Za-z0-9]", "", word)
})
- 進行斷詞,並依照日期建立文本矩陣 TermDocumentMatrix
mixseg = worker()
jieba_tokenizer = function(d)
{
unlist( segment(d[[1]], mixseg) )
}
seg = lapply(d.corpus, jieba_tokenizer)
count_token = function(d)
{
as.data.frame(table(d))
}
tokens = lapply(seg, count_token)
n = length(seg)
TDM = tokens[[1]]
colNames <- names(seg)
colNames <- gsub(".txt", "", colNames)
for( id in c(2:n) )
{
TDM = merge(TDM, tokens[[id]], by="d", all = TRUE)
names(TDM) = c('d', colNames[1:id])
}
TDM[is.na(TDM)] <- 0
library(knitr)
kable(head(TDM))
ㄇ |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
4 |
0 |
0 |
2 |
0 |
0 |
ㄉ |
2 |
1 |
0 |
2 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
3 |
0 |
0 |
0 |
1 |
0 |
ㄊ |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
ㄌ |
1 |
0 |
1 |
5 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
ㄟ |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
1 |
0 |
0 |
2 |
2 |
0 |
0 |
0 |
0 |
ㄧ |
4 |
0 |
3 |
9 |
0 |
1 |
1 |
2 |
5 |
0 |
1 |
2 |
0 |
4 |
5 |
7 |
0 |
7 |
1 |
1 |
3 |
1 |
0 |
6 |
kable(tail(TDM))
24261 |
曬圖 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
24262 |
邏 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
24263 |
罐 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
24264 |
羈押 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
24265 |
讓我 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
24266 |
靈機一動 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
- 將已建好的 TDM 轉成 TF-IDF
tf <- apply(as.matrix(TDM[,2:(n+1)]), 2, sum)
library(Matrix)
idfCal <- function(word_doc)
{
log2( n / nnzero(word_doc) )
}
idf <- apply(as.matrix(TDM[,2:(n+1)]), 1, idfCal)
doc.tfidf <- TDM
# for(x in 1:nrow(TDM))
# {
# for(y in 2:ncol(TDM))
# {
# doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
# }
# }
tempY = matrix(rep(c(as.matrix(tf)), each = length(idf)), nrow = length(idf))
tempX = matrix(rep(c(as.matrix(idf)), each = length(tf)), ncol = length(tf), byrow = TRUE)
doc.tfidf[,2:(n+1)] <- (doc.tfidf[,2:(n+1)] / tempY) * tempX
stopLine = rowSums(doc.tfidf[,2:(n+1)])
delID = which(stopLine == 0)
kable(head(doc.tfidf[delID,1]))
kable(tail(doc.tfidf[delID,1]))
TDM = TDM[-delID,]
doc.tfidf = doc.tfidf[-delID,]
- TF-IDF Hours 文章取得的重要關鍵字
TopWords = data.frame()
for( id in c(1:n) )
{
dayMax = order(doc.tfidf[,id+1], decreasing = TRUE)
showResult = t(as.data.frame(doc.tfidf[dayMax[1:5],1]))
TopWords = rbind(TopWords, showResult)
}
rownames(TopWords) = colnames(doc.tfidf)[2:(n+1)]
TopWords = droplevels(TopWords)
kable(TopWords)
00 |
環保 |
憂鬱症 |
婦產科 |
驗孕 |
憂鬱 |
01 |
沒吵 |
註定 |
不吵 |
過架 |
及格線 |
02 |
髒 |
內褲 |
水桶 |
生日 |
違規 |
03 |
糾正 |
紐約 |
錯字 |
持有 |
選字 |
04 |
星座 |
受益人 |
羯 |
初衷 |
婚後 |
05 |
扶正 |
報應 |
沈玉琳 |
小三 |
元配 |
06 |
固酮 |
睪 |
濃度 |
懶人 |
求站 |
07 |
嫖 |
統計 |
嫖妓 |
慾望 |
濫交 |
08 |
高鐵 |
黏 |
重機 |
嘔 |
貼心 |
09 |
團體 |
愚人節 |
火力 |
年月日 |
執行 |
10 |
先買 |
現場 |
場次 |
排隊 |
買票 |
11 |
持有 |
信任 |
心寒 |
姐妹 |
再問 |
12 |
你媽 |
痛恨 |
罩杯 |
吃不下 |
圍巾 |
13 |
重機 |
汽車 |
騎 |
憂鬱症 |
家長 |
14 |
憂鬱症 |
憂鬱 |
吃藥 |
房租 |
租金 |
15 |
石頭火鍋 |
火鍋 |
小開 |
星座 |
房租 |
16 |
父親 |
錯字 |
宴客 |
糾正 |
房租 |
17 |
堂哥 |
客家人 |
貼圖 |
學妹 |
停止 |
18 |
錯字 |
錯成 |
糾正 |
出戲 |
選字 |
19 |
阿公 |
加班 |
洋蔥 |
阿嬤 |
騙過 |
20 |
白目 |
侮辱 |
當眾 |
專業 |
質疑 |
21 |
出國 |
旅遊業 |
鏡頭 |
嫉妒 |
攝影 |
22 |
隨緣 |
撲 |
問卷 |
撲倒 |
佛法 |
23 |
第一張 |
巧克力 |
甜點 |
超美 |
隨緣 |
- TF-IDF Hours 文章取得的重要關鍵字 TDM merge 視覺化
TDM$d = as.character(TDM$d)
AllTop = as.data.frame( table(as.matrix(TopWords)) )
AllTop = AllTop[order(AllTop$Freq, decreasing = TRUE),]
kable(head(AllTop))
35 |
房租 |
3 |
36 |
糾正 |
3 |
86 |
憂鬱症 |
3 |
94 |
錯字 |
3 |
42 |
持有 |
2 |
43 |
星座 |
2 |
TopNo = 5
tempGraph = data.frame()
for( t in c(1:TopNo) )
{
word = matrix( rep(c(as.matrix(AllTop$Var1[t])), each = n), nrow = n )
temp = cbind( colnames(doc.tfidf)[2:(n+1)], t(TDM[which(TDM$d == AllTop$Var1[t]), 2:(n+1)]), word )
colnames(temp) = c("hour", "freq", "words")
tempGraph = rbind(tempGraph, temp)
names(tempGraph) = c("hour", "freq", "words")
}
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(varhandle)
tempGraph$freq = unfactor(tempGraph$freq)
ggplot(tempGraph, aes(hour, freq)) +
geom_point(aes(color = words, shape = words), size = 5) +
geom_line(aes(group = words, linetype = words))
kable(tail(AllTop))
101 |
懶人 |
1 |
102 |
鏡頭 |
1 |
103 |
騙過 |
1 |
104 |
攝影 |
1 |
105 |
髒 |
1 |
106 |
驗孕 |
1 |
- 發文時間與發文量
filenames = as.array(paste0("./DATA/",colnames(doc.tfidf)[2:(n+1)],".txt"))
sizeResult = apply(filenames, 1, file.size) / 1024
showSize = data.frame(colnames(doc.tfidf)[2:(n+1)], sizeResult)
names(showSize) = c("hour", "size_KB")
ggplot(showSize, aes(x = hour, y = size_KB)) + geom_bar(stat="identity")