library(readr)
## Warning: package 'readr' was built under R version 3.4.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
data = read.csv('all.csv',header = TRUE)%>%as.data.frame()
d = data
#移除遺失值
for (i in 1:ncol(data)){
d = d [ d[,i]!= '-',]
}
for(i in c(3:14)){
d[,i] =as.numeric(as.character(d[,i]))
}
a=na.omit(d)
quan1 =quantile(a$'國小學生輟學率',0.33,na.rm = TRUE)
quan2 =quantile(a$'國小學生輟學率',0.66,na.rm = TRUE)
a = mutate(a,elementary = ' ')
a$elementary [which(a$'國小學生輟學率'< quan1)]='低'
a$elementary [which(a$'國小學生輟學率'>=quan2)]='高'
a$elementary [which(a$elementary!='高' & a$elementary!='低')]='中'
a$elementary =factor(a$elementary,levels = c('低','中','高'))
#輟學可能會讓兒童犯罪率升高
ggplot(data = a, aes(x = a$elementary, y =a$'兒童犯罪人口率' )) +
geom_boxplot() + coord_flip() +
labs( x= '國小學生輟學情況', y ='兒童犯罪人口率' ,
title = '國小輟學率與兒童犯罪人口率')
print(cor(a$'國小學生輟學率', y =a$'兒童犯罪人口率'))#高度相關
## [1] 0.6926309
#輟學與國小師生比無明顯關係
ggplot(data = a, aes(x = a$elementary, y =a$'平均每一教師教導學生數.國小')) +
geom_boxplot() +coord_flip() +
labs( x= '國小學生輟學情況', y ='國小師生比' ,
title = '國小師生比')
print(cor(a$'平均每一教師教導學生數.國小', y =a$'兒童犯罪人口率'))#僅為中度相關
## [1] -0.4790172
#anova檢定(H0:國小輟學率與兒童犯罪率無關)
test1 =data.frame(a$elementary, a$'兒童犯罪人口率')
aov.1 = aov(test1[,2]~test1[,1],data = test1)
summary(aov.1)
## Df Sum Sq Mean Sq F value Pr(>F)
## test1[, 1] 2 9460 4730 20.28 5.19e-08 ***
## Residuals 91 21221 233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pr(>F)極小,故不接受H0假設
ggplot(data = test1,
aes(x =a.elementary, y = test1[,2]))+
stat_summary(fun.data = 'mean_cl_boot', size = 1) +
geom_hline(yintercept = mean(test1$a.elementary) ,
linetype = 'dotted') +
labs(x = '國小輟學率', y = '兒童犯罪率人數') +
coord_flip()
## Warning in mean.default(test1$a.elementary): argument is not numeric or
## logical: returning NA
## Warning: Removed 1 rows containing missing values (geom_hline).
qplot(x=d$'國中學生輟學率',
y=d$'國小學生輟學率',
data=d,
geom="point", # 圖形=scatter plot
main = "國小/中學生輟學率與原住民人口關係圖",
xlab="國中學生輟學率",
ylab="國小學生輟學率",
color=d$'原住民人口比'
)
## Warning: Removed 7 rows containing missing values (geom_point).
#由此圖可以觀察到兩件事,
#一是國小輟學率高的話國中輟學率必定高,反之則不一定
#二是圖形右上方的顏色幾乎都偏淺,推測原住民小孩可能是影響輟學率的一大原因
qplot(x=d$X.1,
y=d$'原住民人口比',
data=d,
geom="point", # 圖形=scatter plot
main = "各縣市與原住民人口關係圖",
xlab="縣市", # X軸的名稱
ylab="原住民人口")
## Warning: Removed 7 rows containing missing values (geom_point).
#我們把依照原住民人口比例分成兩組,用中位數分開
c=na.omit(d)
half =quantile(d$'原住民人口比',0.5,na.rm = TRUE)
c = mutate(c,origin = ' ')
c$origin [which(c$'原住民人口比'> half)]='多'
c$origin [which(c$origin!='多')]='少'
c$origin =factor(c$origin,levels = c('少','多'))
#t-test(Ho假設:原住民人口多寡不會影響輟學率)
t.test(c$'國小學生輟學率'~c$origin)
##
## Welch Two Sample t-test
##
## data: c$國小學生輟學率 by c$origin
## t = -5.235, df = 57.085, p-value = 2.465e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04176931 -0.01865622
## sample estimates:
## mean in group 少 mean in group 多
## 0.02893617 0.05914894
t.test(c$'國中學生輟學率'~c$origin)
##
## Welch Two Sample t-test
##
## data: c$國中學生輟學率 by c$origin
## t = -8.9368, df = 66.319, p-value = 5.408e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4084034 -0.2592561
## sample estimates:
## mean in group 少 mean in group 多
## 0.3557447 0.6895745
#兩者皆 p-value > 0.05 故不接受Ho假說
ggplot(data = c, aes(x = c$origin, y =c$'國小學生輟學率' )) +
geom_boxplot() + coord_flip() +
labs( y= '國小學生輟學情況', x ='原住民比例' ,
title = '國小輟學率與原住民人口比率關係圖')
ggplot(data = c, aes(x = c$origin, y =c$'國中學生輟學率' )) +
geom_boxplot() + coord_flip() +
labs( y= '國中學生輟學情況', x ='原住民比例' ,
title = '國中輟學率與原住民人口比率關係圖')
#從相關系書來看也有一樣結果
cor(a$'國小學生輟學率', y =a$'原住民人口比')
## [1] 0.8182739
cor(a$'國中學生輟學率', y =a$'原住民人口比')
## [1] 0.7611649