library(readr)
## Warning: package 'readr' was built under R version 3.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
# Load the raw table. read.csv() already returns a data.frame, so no further
# coercion is needed.
data <- read.csv("all.csv", header = TRUE)
d <- data

# Remove missing values: drop every row that has "-" in any column.
# NA cells are dropped explicitly as well. `NA != "-"` evaluates to NA, and
# subsetting a data frame with an NA row index inserts an all-NA phantom row
# instead of removing the row.
for (i in seq_len(ncol(d))) {
  keep <- !is.na(d[, i]) & d[, i] != "-"
  d <- d[keep, , drop = FALSE]
}

# Convert columns 3 to 14 to numeric. Going through as.character() makes
# factor columns convert by their labels rather than by internal level codes.
for (i in 3:14) {
  d[, i] <- as.numeric(as.character(d[, i]))
}

# Keep complete rows only, then bin the elementary-school dropout rate into
# three levels using its 33rd and 66th percentiles as cut points.
a <- na.omit(d)
quan1 <- quantile(a$`國小學生輟學率`, probs = 0.33, na.rm = TRUE)
quan2 <- quantile(a$`國小學生輟學率`, probs = 0.66, na.rm = TRUE)

# At or above quan2 -> high, below quan1 -> low, everything else -> middle.
dropout_level <- ifelse(
  a$`國小學生輟學率` >= quan2,
  "高",
  ifelse(a$`國小學生輟學率` < quan1, "低", "中")
)
a <- mutate(
  a,
  elementary = factor(dropout_level, levels = c("低", "中", "高"))
)

# Dropping out may raise the child crime rate.
# Columns are named directly inside aes() so ggplot2 looks them up in `a`
# (the data argument) instead of capturing external vectors via `a$...`.
ggplot(data = a, aes(x = elementary, y = `兒童犯罪人口率`)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "國小學生輟學情況",
    y = "兒童犯罪人口率",
    title = "國小輟學率與兒童犯罪人口率"
  )

# Correlation between elementary dropout rate and child crime rate
# (strongly correlated).
print(cor(a$`國小學生輟學率`, y = a$`兒童犯罪人口率`))
## [1] 0.6926309
# Dropout level shows no obvious relationship with the elementary-school
# student-teacher ratio.
# Columns are named directly inside aes() so ggplot2 looks them up in `a`
# (the data argument) instead of capturing external vectors via `a$...`.
ggplot(data = a, aes(x = elementary, y = `平均每一教師教導學生數.國小`)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "國小學生輟學情況",
    y = "國小師生比",
    title = "國小師生比"
  )

# Only moderately correlated.
# NOTE(review): this correlates the student-teacher ratio with the child
# crime rate, not with the dropout rate discussed above -- confirm intent.
print(cor(a$`平均每一教師教導學生數.國小`, y = a$`兒童犯罪人口率`))
## [1] -0.4790172
# One-way ANOVA (H0: the elementary-school dropout level is unrelated to the
# child crime rate).
# test1 pairs the dropout-level factor (column 1) with the child crime rate
# (column 2). data.frame() derives the column names from the argument
# expressions, which is why the plot further down refers to `a.elementary`.
test1 =data.frame(a$elementary, a$'兒童犯罪人口率')
# Model the crime rate (column 2) by dropout level (column 1); the term label
# in the summary is the deparsed expression `test1[, 1]`.
aov.1 = aov(test1[,2]~test1[,1],data = test1)
summary(aov.1)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## test1[, 1]   2   9460    4730   20.28 5.19e-08 ***
## Residuals   91  21221     233                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pr(>F)極小,故不接受H0假設
# Mean child crime rate with a bootstrap confidence interval (mean_cl_boot)
# for each dropout level; the dotted line marks the overall mean crime rate.
ggplot(data = test1, aes(x = a.elementary, y = test1[, 2])) +
  stat_summary(fun.data = "mean_cl_boot", size = 1) +
  # The reference line must average the numeric crime-rate column (column 2).
  # Averaging the factor column `a.elementary` returns NA, so no line is drawn.
  geom_hline(yintercept = mean(test1[, 2]), linetype = "dotted") +
  labs(x = "國小輟學率", y = "兒童犯罪率人數") +
  coord_flip()

# Scatter plot of junior-high vs. elementary dropout rate, colored by the
# indigenous population share.
# Built with ggplot() rather than qplot() (deprecated in ggplot2 3.4.0).
# Columns are named directly so they are looked up in `d`, which also gives
# the color legend the column name as its title instead of `d$...`.
ggplot(
  data = d,
  aes(x = `國中學生輟學率`, y = `國小學生輟學率`, color = `原住民人口比`)
) +
  geom_point() +
  labs(
    title = "國小/中學生輟學率與原住民人口關係圖",
    x = "國中學生輟學率",
    y = "國小學生輟學率"
  )
## Warning: Removed 7 rows containing missing values (geom_point).

#由此圖可以觀察到兩件事,
#一是國小輟學率高的話國中輟學率必定高,反之則不一定
#二是圖形右上方的顏色幾乎都偏淺,推測原住民小孩可能是影響輟學率的一大原因
# Scatter plot of the indigenous population share for each city/county
# (column X.1).
# Built with ggplot() rather than qplot() (deprecated in ggplot2 3.4.0);
# columns are named directly so they are looked up in `d`.
ggplot(data = d, aes(x = X.1, y = `原住民人口比`)) +
  geom_point() +
  labs(
    title = "各縣市與原住民人口關係圖",
    x = "縣市",
    y = "原住民人口"
  )
## Warning: Removed 7 rows containing missing values (geom_point).

# Split the complete rows into two groups by indigenous population share,
# cutting at the median. The median is taken over `d` (NA values ignored),
# not over the complete-case subset.
# NOTE: the data frame is named `c`, which shadows base::c as a variable;
# calls such as c("少", "多") still resolve to the function.
c <- na.omit(d)
half <- quantile(d$`原住民人口比`, probs = 0.5, na.rm = TRUE)

# Above the median -> "多" (many), otherwise -> "少" (few).
origin_group <- factor(
  ifelse(c$`原住民人口比` > half, "多", "少"),
  levels = c("少", "多")
)
c <- mutate(c, origin = origin_group)
# t-test (H0: the size of the indigenous population share does not affect the
# dropout rate).
# Compares the elementary-school dropout rate between the two `origin` groups;
# the printed output below reports a Welch two-sample test.
t.test(c$'國小學生輟學率'~c$origin)
## 
##  Welch Two Sample t-test
## 
## data:  c$國小學生輟學率 by c$origin
## t = -5.235, df = 57.085, p-value = 2.465e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04176931 -0.01865622
## sample estimates:
## mean in group 少 mean in group 多 
##       0.02893617       0.05914894
# Same comparison for the junior-high dropout rate between the two `origin`
# groups.
t.test(c$'國中學生輟學率'~c$origin)
## 
##  Welch Two Sample t-test
## 
## data:  c$國中學生輟學率 by c$origin
## t = -8.9368, df = 66.319, p-value = 5.408e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4084034 -0.2592561
## sample estimates:
## mean in group 少 mean in group 多 
##        0.3557447        0.6895745
#兩者皆 p-value < 0.05 故不接受Ho假說
# Box plot of the elementary-school dropout rate for each indigenous
# population group.
# Columns are named directly inside aes() so ggplot2 looks them up in `c`
# (the data argument) instead of capturing external vectors via `c$...`.
ggplot(data = c, aes(x = origin, y = `國小學生輟學率`)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    y = "國小學生輟學情況",
    x = "原住民比例",
    title = "國小輟學率與原住民人口比率關係圖"
  )

# Box plot of the junior-high dropout rate for each indigenous population
# group.
# Columns are named directly inside aes() so ggplot2 looks them up in `c`
# (the data argument) instead of capturing external vectors via `c$...`.
ggplot(data = c, aes(x = origin, y = `國中學生輟學率`)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    y = "國中學生輟學情況",
    x = "原住民比例",
    title = "國中輟學率與原住民人口比率關係圖"
  )

#從相關係數來看也有一樣結果
# Correlation of the indigenous population share with the elementary-school
# dropout rate.
cor(x = a$`國小學生輟學率`, y = a$`原住民人口比`)
## [1] 0.8182739
# Correlation of the indigenous population share with the junior-high
# dropout rate.
cor(x = a$`國中學生輟學率`, y = a$`原住民人口比`)
## [1] 0.7611649