登革熱病例分析

套件安裝

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute

讀入資料

Dengue <- read_csv("C:/Users/angel/Desktop/github/Dengue.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   year = col_integer(),
##   month = col_integer(),
##   city = col_character(),
##   gender = col_character(),
##   foreign = col_logical(),
##   age = col_character(),
##   count = col_integer()
## )
Dengue = Dengue[,-1]
attach(Dengue)
Dengue$city = as.factor(city)
Dengue$gender = as.factor(gender)
Dengue$foreign = as.factor(foreign)
Dengue$age = as.factor(age)
head(Dengue)
## # A tibble: 6 x 7
##    year month   city gender foreign    age count
##   <int> <int> <fctr> <fctr>  <fctr> <fctr> <int>
## 1  2003     1 台中市      M    TRUE  10-14     1
## 2  2003     1 台中市      M    TRUE  55-59     1
## 3  2003     1 台北市      F    TRUE  35-39     1
## 4  2003     1 台北市      M    TRUE  35-39     1
## 5  2003     1 台南市      F   FALSE  55-59     1
## 6  2003     1 台南市      F   FALSE  65-69     1

SVM分析

預測此病例是否為境外移入

testID = sample(1:nrow(Dengue),100,replace=FALSE)
x = subset(Dengue[testID,],select = -foreign)
y = Dengue$foreign[testID]
svm_model = svm(foreign ~., data = Dengue[-testID,])
pred = predict(svm_model, x)
t = table(pred,y)
t
##        y
## pred    FALSE TRUE
##   FALSE    47    9
##   TRUE      8   36

計算此模型的正確率

ac = (t[1,1]+t[2,2])/100
ac
## [1] 0.83

相關係數與迴歸分析

計算年月與得登革熱人數的相關係數

cy = Dengue %>% group_by(year) %>% summarise(count=sum(count))
cor(cy$count,cy$year)
## [1] 0.387132
cm = Dengue %>% group_by(month) %>% summarise(count=sum(count))
cor(cm$count,cm$month)
## [1] 0.7338041

由結果可知兩者皆與得登革熱人數呈正相關,畫出迴歸圖形看看

result1 = lm(cy$count ~ cy$year)
plot(cy$year, cy$count, pch = 16, cex = 1.3, col = "blue",
     xlab = "Year", ylab = "Number of Patient")
abline(result1)

result2 = lm(cm$count ~ cm$month)
plot(cm$month, cm$count, pch = 16, cex = 1.3, col = "blue",
     xlab = "Month", ylab = "Number of Patient")
abline(result2)

由圖形可看出,得登革熱人數在2014和2015年飆高,月份上則是從七八月開始增加,到十二月又開始減少

T檢定

我以性別對得登革熱人數的影響進行T檢定,首先先畫出盒型圖,由於人數分布較為分散,故我取了log

cg = Dengue %>% group_by(year,month,gender) %>% summarise(count=sum(count))
require(ggplot2)
ggplot(data = cg, aes(x = gender, y = log10(count))) +
  geom_boxplot() + coord_flip() +
  labs( y = 'Log10 of Patient Number', x = 'gender')

接著進行T檢定

t.test(count ~ gender, data = cg)
## 
##  Welch Two Sample t-test
## 
## data:  count by gender
## t = 0.022859, df = 347.85, p-value = 0.9818
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -176.7707  180.9281
## sample estimates:
## mean in group F mean in group M 
##        207.9253        205.8466

p-value為0.9818 > 0.05,故無法否認虛無假說,即男女得到登革熱的人數是一樣的

ANOVA檢定

分析年齡對得登革熱人數的影響,首先定出年齡層的順序

ca = Dengue %>% group_by(year,month,age) %>% summarise(count=sum(count))
ca$age <- factor(ca$age, levels = c('0','1','2','3','4',
                                            '5-9','10-14','15-19',
                                            '20-24','25-29','30-34',
                                            '35-39','40-44','45-49',
                                            '50-54','55-59','60-64',
                                            '65-69','70+'))

看不同年齡層平均每月得登革熱的人數,加上信賴區間

ggplot(data = ca, 
       aes(x = age, y = count)) +
  stat_summary(fun.data = 'mean_cl_boot', size = 1) +
  scale_y_continuous(breaks = seq(500, 660, by = 20)) +
  geom_hline(yintercept = mean(ca$age) , 
             linetype = 'dotted') +
  labs(x = '年齡層', y = '每月得登革熱的人數') +
  coord_flip()
## Warning in mean.default(ca$age): argument is not numeric or logical:
## returning NA
## Warning: Removed 1 rows containing missing values (geom_hline).

從圖形看起來,得登革熱人數隨年齡升高而增加,且年齡越高人數分布越分散,用ANOVA檢定看看

anova(a <- lm(count ~ age, data = ca))
## Analysis of Variance Table
## 
## Response: count
##             Df   Sum Sq Mean Sq F value Pr(>F)
## age         18   425393   23633  1.1927 0.2579
## Residuals 1957 38777206   19815

從p-value為0.2579 > 0.05可知,年齡對得登革熱人數的影響不大

接下來分析地區對得登革熱人數的影響

cc = Dengue %>% group_by(year,month,city) %>% summarise(count=sum(count))
anova(c <- lm(count ~ city, data = cc))
## Analysis of Variance Table
## 
## Response: count
##             Df    Sum Sq Mean Sq F value    Pr(>F)    
## city        21  12442825  592515  2.3744 0.0004486 ***
## Residuals 1477 368580581  249547                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

由p-value = 0.0004486 < 0.05可知,不同地區得登革熱人數確有差異,畫圖看看

ggplot(data = cc, 
       aes(x = city, y = count)) +
  stat_summary(fun.data = 'mean_cl_boot', size = 1) +
  scale_y_continuous(breaks = seq(500, 660, by = 20)) +
  geom_hline(yintercept = mean(cc$city) , 
             linetype = 'dotted') +
  labs(x = '縣市', y = '每月得登革熱的人數') +
  coord_flip()
## Warning in mean.default(cc$city): argument is not numeric or logical:
## returning NA
## Warning: Removed 1 rows containing missing values (geom_pointrange).
## Warning: Removed 1 rows containing missing values (geom_hline).

由圖可知,高雄市和台南市得登革熱的人數明顯高於其他地區