在這份範例中,我們會示範如何使用一些統計上的技巧來分析你的資料。
該資料集內包含了4000多位學生在數學及科學方面的成績、興趣、投入時間,以及家長教育背景、教育資源量
dta <- read.table(file = "data/TIMSS2011TW.txt",
header = TRUE)
str(dta)
## 'data.frame': 4467 obs. of 13 variables:
## $ gender : Factor w/ 2 levels "boy","girl": 2 2 2 2 2 2 2 2 2 2 ...
## $ math : num 729 776 718 607 658 ...
## $ math.interest : num 8.93 13.47 9.6 13.47 8.27 ...
## $ math.evaluation : num 9.34 9.34 10.35 10.35 8.21 ...
## $ math.input : num 9.16 12.42 10.15 8.71 7.86 ...
## $ math.hours : Factor w/ 3 levels "<= 45min",">= 3hours",..: 3 1 1 1 3 1 1 3 1 1 ...
## $ science : num 683 663 667 575 650 ...
## $ science.interest : num 7.65 7.65 9.42 9.42 9.04 ...
## $ science.evaluation : num 9.16 6.53 11.09 7.6 8.23 ...
## $ science.input : num 8.3 7.92 9.55 7.92 8.7 ...
## $ science.hours : Factor w/ 3 levels "<= 45min",">= 3hours",..: 1 1 1 1 1 1 1 1 1 3 ...
## $ parental.education : Factor w/ 5 levels "college","elementary school",..: 3 3 2 4 5 1 5 4 3 3 ...
## $ educational.resources: num 9.6 8.92 6.33 10.25 10.93 ...
head(dta)
## gender math math.interest math.evaluation math.input math.hours
## 1 girl 729.3937 8.93041 9.34439 9.15641 45min - 3hours
## 2 girl 776.1965 13.46507 9.34439 12.42205 <= 45min
## 3 girl 718.1735 9.60333 10.35139 10.15325 <= 45min
## 4 girl 607.1847 13.46507 10.35139 8.70884 <= 45min
## 5 girl 658.1759 8.26761 8.20673 7.85736 45min - 3hours
## 6 girl 478.5763 6.36452 7.27410 7.85736 <= 45min
## science science.interest science.evaluation science.input science.hours
## 1 682.7541 7.64598 9.15956 8.30491 <= 45min
## 2 663.3682 7.64598 6.52660 7.91938 <= 45min
## 3 667.1151 9.41832 11.09147 9.54897 <= 45min
## 4 575.0923 9.41832 7.60129 7.91938 <= 45min
## 5 649.7578 9.03893 8.23142 8.69954 <= 45min
## 6 491.3467 12.94370 9.15956 10.57281 <= 45min
## parental.education educational.resources
## 1 high school 9.60097
## 2 high school 8.91919
## 3 elementary school 6.33067
## 4 junior high school 10.25396
## 5 university above 10.92551
## 6 college 10.92551
summary(dta)
## gender math math.interest math.evaluation
## boy :2225 Min. :166.4 Min. : 5.037 Min. : 3.412
## girl:2242 1st Qu.:556.0 1st Qu.: 7.912 1st Qu.: 7.274
## Median :629.9 Median : 8.930 Median : 8.207
## Mean :618.1 Mean : 9.079 Mean : 8.357
## 3rd Qu.:687.6 3rd Qu.: 9.965 3rd Qu.: 9.344
## Max. :918.1 Max. :13.465 Max. :13.707
## math.input math.hours science science.interest
## Min. : 3.265 <= 45min :1628 Min. :178.8 Min. : 4.513
## 1st Qu.: 7.857 >= 3hours : 803 1st Qu.:517.9 1st Qu.: 7.989
## Median : 8.584 45min - 3hours:2036 Median :577.5 Median : 9.039
## Mean : 8.583 Mean :570.6 Mean : 9.010
## 3rd Qu.: 9.635 3rd Qu.:627.2 3rd Qu.: 9.827
## Max. :14.343 Max. :844.3 Max. :12.944
## science.evaluation science.input science.hours
## Min. : 4.136 Min. : 3.556 <= 45min :2396
## 1st Qu.: 7.601 1st Qu.: 7.524 >= 3hours : 274
## Median : 8.540 Median : 8.700 45min - 3hours:1797
## Mean : 8.556 Mean : 8.613
## 3rd Qu.: 9.479 3rd Qu.: 9.549
## Max. :13.103 Max. :13.833
## parental.education educational.resources
## college : 672 Min. : 4.323
## elementary school : 78 1st Qu.: 9.601
## high school :2021 Median :10.254
## junior high school: 449 Mean :10.486
## university above :1247 3rd Qu.:11.649
## Max. :14.018
require(ggplot2)
## Loading required package: ggplot2
#將底下的圖設定為黑白配色(theme_bw)
old <- theme_set(theme_bw())
ggplot(data = dta, aes(x = gender, y = math)) +
geom_boxplot() + coord_flip() +
labs( y = 'math', x = 'gender',
title = 'Mathematical Score Box')
#以下函式計算95%信賴區間
with(dta,
tapply(math, gender,
function(x)
c(mean(x) + c(-2, 2) * sd(x)/sqrt(length(x)))))
## $boy
## [1] 612.0913 621.0584
##
## $girl
## [1] 615.4899 623.5705
#此函數會預設進行 Welch 校正,以處理兩樣本變異數不相同的問題
t.test(math ~ gender, data = dta)
##
## Welch Two Sample t-test
##
## data: math by gender
## t = -0.97932, df = 4414, p-value = 0.3275
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.871550 2.960942
## sample estimates:
## mean in group boy mean in group girl
## 616.5749 619.5302
#可加上參數 var.equal=TRUE 來假設變異數同值(不做Welch校正)
t.test(math ~ gender, data = dta, var.equal = TRUE)
##
## Two Sample t-test
##
## data: math by gender
## t = -0.97969, df = 4465, p-value = 0.3273
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.869280 2.958673
## sample estimates:
## mean in group boy mean in group girl
## 616.5749 619.5302
library(Hmisc)
#先把父母教育欄位內各個水準順序定下來(order of factors)
dta$parental.education <- factor(dta$parental.education,
levels = c('elementary school',
'junior high school',
'high school',
'college',
'university above'))
#看不同父母教育程度下的數學分數平均數
tapply(dta$math, dta$parental.education, mean)
## elementary school junior high school high school
## 536.5940 558.7106 598.8742
## college university above
## 645.2816 660.9434
#同父母教育程度下的數學分數平均數,加上信賴區間
ggplot(data = dta,
aes(x = parental.education, y = math)) +
stat_summary(fun.data = 'mean_cl_boot', size = 1) +
scale_y_continuous(breaks = seq(500, 660, by = 20)) +
geom_hline(yintercept = mean(dta$math) ,
linetype = 'dotted') +
labs(x = '父母教育', y = '數學平均分數') +
coord_flip()
anova(m1 <- lm(math ~ parental.education, data = dta))
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## parental.education 4 5634301 1408575 158.12 < 2.2e-16 ***
## Residuals 4462 39748578 8908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data = dta,
aes(group = parental.education,
y = math, x = educational.resources)) +
geom_point() +
stat_smooth(method = 'lm', se = F) +
stat_smooth(aes(group = parental.education,
y = math, x = educational.resources),
method = 'lm', se = F) +
facet_grid( . ~ parental.education) +
labs(x = '教育資源', y = '數學分數')
#把教育資源加進模型
anova(m2 <- update(m1, . ~ . +
educational.resources, data = dta))
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## parental.education 4 5634301 1408575 168.57 < 2.2e-16 ***
## educational.resources 1 2471927 2471927 295.82 < 2.2e-16 ***
## Residuals 4461 37276651 8356
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#或許不是父母教育而是教育資源造成
anova(m3 <- update(m2, . ~ . - parental.education, data = dta))
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## educational.resources 1 7799321 7799321 926.57 < 2.2e-16 ***
## Residuals 4465 37583557 8417
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#將結果放在一個list中
res_lm <- lapply(list(m1, m2, m3), summary)
#比較在控制教育資源下,父母教育的效果
(res_lm[[2]]$r.sq - res_lm[[3]]$r.sq)/res_lm[[2]]$r.sq
## [1] 0.03786054
anova(m3, m2)
## Analysis of Variance Table
##
## Model 1: math ~ educational.resources
## Model 2: math ~ parental.education + educational.resources
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4465 37583557
## 2 4461 37276651 4 306906 9.1821 2.192e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#比較在控制父母教育下,教育資源的效果
(res_lm[[2]]$r.sq - res_lm[[1]]$r.sq)/res_lm[[1]]$r.sq
## [1] 0.4387283
anova(m1, m2)
## Analysis of Variance Table
##
## Model 1: math ~ parental.education
## Model 2: math ~ parental.education + educational.resources
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4462 39748578
## 2 4461 37276651 1 2471927 295.82 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
require(coefplot)
m2 <- lm(math ~ parental.education+educational.resources- 1,
data = dta)
coefplot(m2, xlab = '估計值', ylab = '迴歸變項', title = '反應變項 = 數學分數')
fit_m2 <- data.frame(dta[, c(2, 12, 13)], fitted = fitted(m2), resid = resid(m2),
infl = influence(m2)$hat )
ggplot(data = fit_m2, aes(x = math, group = parental.education )) +
stat_density(geom = 'path', position = 'identity') +
stat_density(geom = 'path', position = 'identity', aes(x = fitted)) +
geom_vline(xintercept = c(with(dta, tapply(math,parental.education, mean))), linetype = 'dotted')+
facet_grid(parental.education ~ .) +
scale_x_continuous(breaks = seq(200, 900, by = 100))+
labs(x = '數學分數', y = '機率密度')
ggplot(data = fit_m2, aes(x = scale(resid)), group = parental.education ) +
stat_density(geom = 'path', position = 'identity', aes(linetype = parental.education)) +
scale_linetype_manual(values = 5:1) +
guides(linetype = guide_legend(reverse = TRUE)) +
labs(x = '標準化殘差', y = '機率密度') +
theme(legend.position = c(.15, .8))
require(lattice)
qqmath(~ scale(resid) | parental.education, data = fit_m2, type = c('p', 'g', 'r'),
xlab = '常態位數', ylab = '標準化殘差', layout = c(2, 3),
pch = '.', cex = 2)
require(MASS)
ggplot(data = fit_m2, aes(x = fitted, y = scale(resid), group = parental.education )) +
geom_point(pch = 20, size = 1) +
stat_smooth(method = 'rlm', se = F) +
facet_grid(parental.education ~ .) +
labs(x = '數學預測值', y = '標準化殘差')
ggplot(data = fit_m2, aes(x = infl, y = scale(resid), group = parental.education)) +
geom_text(aes(label = rownames(fit_m2)), cex = 2) +
geom_hline(yintercept = 0, linetype = 'dotted') +
facet_grid(parental.education ~ .) +
labs(x = '影響值', y = '標準化殘差')
summary(influence(m2)$hat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0004960 0.0006413 0.0008784 0.0013432 0.0015737 0.0149093
theme_set(old)
dta_math <- dta[, c('math', 'math.interest',
'math.evaluation', 'math.input')]
colMeans(dta_math)
## math math.interest math.evaluation math.input
## 618.058149 9.079051 8.356993 8.583309
require(heplots)
scatterplotMatrix(~ math + math.interest + math.evaluation + math.input, data= dta_math,
pch = '.', cex = 3, smooth = FALSE, reg.line = FALSE, ellipse = TRUE,
diagonal = 'none', lower.panel = NULL)
require(corrplot)
corrplot(cor(dta_math), method = 'ellipse', order = 'hclust', addrect = 4,
type = 'upper', tl.pos = 'tp')
corrplot(cor(dta_math), add = TRUE, type = 'lower', method = 'number',
order = 'hclust', col = 'black', diag = FALSE, tl.pos = 'n', cl.pos = 'n')
summary(m4 <- lm(math ~ math.interest + math.evaluation + math.input, data = dta_math))
##
## Call:
## lm(formula = math ~ math.interest + math.evaluation + math.input,
## data = dta_math)
##
## Residuals:
## Min 1Q Median 3Q Max
## -427.11 -54.39 6.97 61.36 277.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 383.4531 7.4041 51.790 < 2e-16 ***
## math.interest 16.7783 0.9436 17.780 < 2e-16 ***
## math.evaluation 7.0554 0.9501 7.426 1.33e-13 ***
## math.input 2.7160 1.0774 2.521 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.76 on 4463 degrees of freedom
## Multiple R-squared: 0.2252, Adjusted R-squared: 0.2247
## F-statistic: 432.5 on 3 and 4463 DF, p-value: < 2.2e-16
coefplot(m4, predictors = c('math.interest', 'math.evaluation',
'math.input'),
xlab = '估計值', ylab = '迴歸變項(去除截距)', title = '反應變項是數學分數')
require(effects)
plot(allEffects(m4), main = '', ylim = c(550, 670), grid = T)
library(lm.beta)
summary(lm.beta(m4))
##
## Call:
## lm(formula = math ~ math.interest + math.evaluation + math.input,
## data = dta_math)
##
## Residuals:
## Min 1Q Median 3Q Max
## -427.11 -54.39 6.97 61.36 277.72
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 383.45307 0.00000 7.40407 51.790 < 2e-16 ***
## math.interest 16.77826 0.34905 0.94365 17.780 < 2e-16 ***
## math.evaluation 7.05539 0.12893 0.95007 7.426 1.33e-13 ***
## math.input 2.71604 0.04592 1.07742 2.521 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.76 on 4463 degrees of freedom
## Multiple R-squared: 0.2252, Adjusted R-squared: 0.2247
## F-statistic: 432.5 on 3 and 4463 DF, p-value: < 2.2e-16
summary(m5 <- update(m4, . ~ . - math.input , data = dta_math))
##
## Call:
## lm(formula = math ~ math.interest + math.evaluation, data = dta_math)
##
## Residuals:
## Min 1Q Median 3Q Max
## -423.64 -54.22 6.82 61.30 281.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 391.5857 6.6683 58.724 < 2e-16 ***
## math.interest 17.9769 0.8156 22.042 < 2e-16 ***
## math.evaluation 7.5696 0.9285 8.153 4.58e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.81 on 4464 degrees of freedom
## Multiple R-squared: 0.2241, Adjusted R-squared: 0.2238
## F-statistic: 644.8 on 2 and 4464 DF, p-value: < 2.2e-16
anova(m5, m4)
## Analysis of Variance Table
##
## Model 1: math ~ math.interest + math.evaluation
## Model 2: math ~ math.interest + math.evaluation + math.input
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4464 35211080
## 2 4463 35161015 1 50065 6.3548 0.01174 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m5 <- lm(math ~ math.interest + math.hours + math.evaluation + educational.resources, data = dta)
fit_m5 <- data.frame(dta[, c(2, 3, 6, 4, 13)], fitted = fitted(m5), resid = resid(m5), infl = influence(m5)$hat)
ggplot(data = fit_m5, aes(x = math, group = math.hours )) +
stat_density(geom = 'path', position = 'identity') +
stat_density(geom = 'path', position = 'identity', aes(x = fitted)) +
geom_vline(xintercept = c(with(dta, tapply(math, math.hours, mean))), linetype = 'dotted')+
facet_grid(math.hours ~ .) +
scale_x_continuous(breaks = seq(200, 900, by = 100))+
labs(x = '數學分數', y = '機率密度')