library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
# Load the FEV data set. stringsAsFactors = TRUE is spelled out because the
# default changed to FALSE in R 4.0; the SVM classification and
# confusionMatrix() steps later in this script need Smoker to be a factor.
dfev <- read.csv("fev.csv", stringsAsFactors = TRUE)
# Keep only columns 2-4 and 6 (the columns used later include Age, FEV and
# Smoker -- TODO confirm the exact column layout of fev.csv).
dfev <- dfev[, c(2:4, 6)]
想要看新生兒肺功能指數(FEV)是否會受到年齡、周遭抽菸因子的影響。先畫出年齡與肺功能的散布圖,並以顏色標示不同的抽菸狀態。
# Scatter plot of lung function (FEV) against Age, coloured by smoking status.
# The response (FEV) is placed on the y-axis and the predictor (Age) on the
# x-axis so the picture matches the FEV ~ Age models fitted in this script.
g <- ggplot(data = dfev, aes(x = Age, y = FEV))
g1 <- g + geom_point(aes(color = Smoker))
# Auto-print the plot.
g1
H0:肺功能不會受到年齡、抽菸與否的影響
# Additive linear model: FEV explained by Age and smoking status
# (main effects only, no interaction term).
m1 <- lm(
  formula = FEV ~ Age + Smoker,
  data = dfev
)
# Show the coefficient table and overall fit statistics.
summary(m1)
##
## Call:
## lm(formula = FEV ~ Age + Smoker, data = dfev)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6653 -0.3564 -0.0508 0.3494 2.0894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.158378 0.131003 1.209 0.22712
## Age 0.230605 0.008184 28.176 < 2e-16 ***
## SmokerNon 0.208995 0.080745 2.588 0.00986 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5651 on 651 degrees of freedom
## Multiple R-squared: 0.5766, Adjusted R-squared: 0.5753
## F-statistic: 443.3 on 2 and 651 DF, p-value: < 2.2e-16
發現年紀與是否抽菸的p-value均<0.05,故可推測二者確實會對新生兒的肺功能有影響
進一步看看抽菸與年齡是否會產生交互作用,對新生兒的肺功能有影響
# Interaction model: `Age * Smoker` expands to both main effects plus the
# Age:Smoker interaction, letting the Age slope differ by smoking status.
m2 <- lm(
  formula = FEV ~ Age * Smoker,
  data = dfev
)
# Show the coefficient table and overall fit statistics.
summary(m2)
##
## Call:
## lm(formula = FEV ~ Age * Smoker, data = dfev)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76645 -0.34947 -0.03364 0.33679 2.05990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19697 0.40596 5.412 8.78e-08 ***
## Age 0.07986 0.02959 2.699 0.00713 **
## SmokerNon -1.94357 0.41428 -4.691 3.31e-06 ***
## Age:SmokerNon 0.16270 0.03074 5.293 1.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5537 on 650 degrees of freedom
## Multiple R-squared: 0.5941, Adjusted R-squared: 0.5922
## F-statistic: 317.1 on 3 and 650 DF, p-value: < 2.2e-16
發現年齡與抽菸的交互作用項,其p-value<0.05,故可推測二者會產生交互作用並且對新生兒的肺功能有影響
# Build the model
# x holds the predictor columns (everything except Smoker) and y the observed
# Smoker labels; both are used later to check the predictions.
x <- subset(dfev, select = -Smoker)
y <- dfev$Smoker
# Fit an SVM classifier for Smoker on all remaining columns, without scaling
# the inputs. The argument name must be spelled `kernel` for the kernel choice
# to be passed to svm(); "radial" is also the kernel reported in the printed
# model summary.
model <- svm(
  Smoker ~ .,
  data = dfev,
  cost = 10,
  kernel = "radial",
  scale = FALSE
)
print(model)
##
## Call:
## svm(formula = Smoker ~ ., data = dfev, cost = 10, kernel = "radial",
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
## gamma: 0.3333333
##
## Number of Support Vectors: 197
# Prediction: x supplies the predictors, and y holds the true labels that the
# predictions are compared against.
presult <- predict(model, newdata = x)
confusion matrix
# Cross-tabulate the predicted classes against the observed Smoker labels and
# report accuracy, kappa, sensitivity, specificity and related statistics.
confusionMatrix(data = presult, reference = y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Current Non
## Current 31 5
## Non 34 584
##
## Accuracy : 0.9404
## 95% CI : (0.9194, 0.9573)
## No Information Rate : 0.9006
## P-Value [Acc > NIR] : 0.0001926
##
## Kappa : 0.5844
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.47692
## Specificity : 0.99151
## Pos Pred Value : 0.86111
## Neg Pred Value : 0.94498
## Prevalence : 0.09939
## Detection Rate : 0.04740
## Detection Prevalence : 0.05505
## Balanced Accuracy : 0.73422
##
## 'Positive' Class : Current
##
結論:雖然未抽菸者在整筆資料裡面明顯占有多數(無資訊率約90%),但此模型預測的準確率仍高達94%,能相當準確地預測出抽菸與未抽菸的人。需注意的是,此準確率是以訓練資料本身評估,且對抽菸者的敏感度僅約48%。