Regression Example

在這份範例中,我們會示範如何使用一些統計上的技巧來分析你的資料。

首先我們讀入來自TIMSS 的 2011 年台灣資料

該資料集內包含了4000多位學生在數學及科學方面的成績、興趣、投入時間,以及家長教育背景、教育資源量

dta <- read.table(file = "data/TIMSS2011TW.txt", 
                  header = TRUE)

來看看它的資料結構和一些基本統計

str(dta)
## 'data.frame':    4467 obs. of  13 variables:
##  $ gender               : Factor w/ 2 levels "boy","girl": 2 2 2 2 2 2 2 2 2 2 ...
##  $ math                 : num  729 776 718 607 658 ...
##  $ math.interest        : num  8.93 13.47 9.6 13.47 8.27 ...
##  $ math.evaluation      : num  9.34 9.34 10.35 10.35 8.21 ...
##  $ math.input           : num  9.16 12.42 10.15 8.71 7.86 ...
##  $ math.hours           : Factor w/ 3 levels "<= 45min",">= 3hours",..: 3 1 1 1 3 1 1 3 1 1 ...
##  $ science              : num  683 663 667 575 650 ...
##  $ science.interest     : num  7.65 7.65 9.42 9.42 9.04 ...
##  $ science.evaluation   : num  9.16 6.53 11.09 7.6 8.23 ...
##  $ science.input        : num  8.3 7.92 9.55 7.92 8.7 ...
##  $ science.hours        : Factor w/ 3 levels "<= 45min",">= 3hours",..: 1 1 1 1 1 1 1 1 1 3 ...
##  $ parental.education   : Factor w/ 5 levels "college","elementary school",..: 3 3 2 4 5 1 5 4 3 3 ...
##  $ educational.resources: num  9.6 8.92 6.33 10.25 10.93 ...
head(dta)
##   gender     math math.interest math.evaluation math.input     math.hours
## 1   girl 729.3937       8.93041         9.34439    9.15641 45min - 3hours
## 2   girl 776.1965      13.46507         9.34439   12.42205       <= 45min
## 3   girl 718.1735       9.60333        10.35139   10.15325       <= 45min
## 4   girl 607.1847      13.46507        10.35139    8.70884       <= 45min
## 5   girl 658.1759       8.26761         8.20673    7.85736 45min - 3hours
## 6   girl 478.5763       6.36452         7.27410    7.85736       <= 45min
##    science science.interest science.evaluation science.input science.hours
## 1 682.7541          7.64598            9.15956       8.30491      <= 45min
## 2 663.3682          7.64598            6.52660       7.91938      <= 45min
## 3 667.1151          9.41832           11.09147       9.54897      <= 45min
## 4 575.0923          9.41832            7.60129       7.91938      <= 45min
## 5 649.7578          9.03893            8.23142       8.69954      <= 45min
## 6 491.3467         12.94370            9.15956      10.57281      <= 45min
##   parental.education educational.resources
## 1        high school               9.60097
## 2        high school               8.91919
## 3  elementary school               6.33067
## 4 junior high school              10.25396
## 5   university above              10.92551
## 6            college              10.92551
summary(dta)
##   gender          math       math.interest    math.evaluation 
##  boy :2225   Min.   :166.4   Min.   : 5.037   Min.   : 3.412  
##  girl:2242   1st Qu.:556.0   1st Qu.: 7.912   1st Qu.: 7.274  
##              Median :629.9   Median : 8.930   Median : 8.207  
##              Mean   :618.1   Mean   : 9.079   Mean   : 8.357  
##              3rd Qu.:687.6   3rd Qu.: 9.965   3rd Qu.: 9.344  
##              Max.   :918.1   Max.   :13.465   Max.   :13.707  
##    math.input              math.hours      science      science.interest
##  Min.   : 3.265   <= 45min      :1628   Min.   :178.8   Min.   : 4.513  
##  1st Qu.: 7.857   >= 3hours     : 803   1st Qu.:517.9   1st Qu.: 7.989  
##  Median : 8.584   45min - 3hours:2036   Median :577.5   Median : 9.039  
##  Mean   : 8.583                         Mean   :570.6   Mean   : 9.010  
##  3rd Qu.: 9.635                         3rd Qu.:627.2   3rd Qu.: 9.827  
##  Max.   :14.343                         Max.   :844.3   Max.   :12.944  
##  science.evaluation science.input           science.hours 
##  Min.   : 4.136     Min.   : 3.556   <= 45min      :2396  
##  1st Qu.: 7.601     1st Qu.: 7.524   >= 3hours     : 274  
##  Median : 8.540     Median : 8.700   45min - 3hours:1797  
##  Mean   : 8.556     Mean   : 8.613                        
##  3rd Qu.: 9.479     3rd Qu.: 9.549                        
##  Max.   :13.103     Max.   :13.833                        
##           parental.education educational.resources
##  college           : 672     Min.   : 4.323       
##  elementary school :  78     1st Qu.: 9.601       
##  high school       :2021     Median :10.254       
##  junior high school: 449     Mean   :10.486       
##  university above  :1247     3rd Qu.:11.649       
##                              Max.   :14.018

對資料有些了解以後我們載入 ggplot2 準備畫圖

require(ggplot2)
## Loading required package: ggplot2
#將底下的圖設定為黑白配色(theme_bw)
old <- theme_set(theme_bw())

Part A. 性別 vs 數學成績


首先從不同性別的數學分數盒鬚圖開始,並計算其信賴區間

ggplot(data = dta, aes(x = gender, y = math)) +
 geom_boxplot() + coord_flip() +
 labs( y = 'math', x = 'gender', 
       title = 'Mathematical Score Box')

#以下函式計算95%信賴區間
with(dta, 
     tapply(math, gender,
     function(x) 
       c(mean(x) + c(-2, 2) * sd(x)/sqrt(length(x)))))
## $boy
## [1] 612.0913 621.0584
## 
## $girl
## [1] 615.4899 623.5705

以下用 t-test 檢驗不同性別是否存在數學成績差異

#此函數會預設進行 Welch 校正,以處理兩樣本變異數不相同的問題
t.test(math ~ gender, data = dta)
## 
##  Welch Two Sample t-test
## 
## data:  math by gender
## t = -0.97932, df = 4414, p-value = 0.3275
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.871550  2.960942
## sample estimates:
##  mean in group boy mean in group girl 
##           616.5749           619.5302
#可加上參數 var.equal=TRUE 來假設變異數同值(不做Welch校正)
t.test(math ~ gender, data = dta, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  math by gender
## t = -0.97969, df = 4465, p-value = 0.3273
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.869280  2.958673
## sample estimates:
##  mean in group boy mean in group girl 
##           616.5749           619.5302

Part B. 父母教育 & 教育資源量 vs 數學成績


以下觀察父母的教育背景是否與數學成績有關

library(Hmisc)
#先把父母教育欄位內各個水準順序定下來(order of factors)
dta$parental.education <- factor(dta$parental.education, 
                       levels = c('elementary school',
                                  'junior high school',
                                  'high school',
                                  'college', 
                                  'university above'))

#看不同父母教育程度下的數學分數平均數
tapply(dta$math, dta$parental.education, mean)
##  elementary school junior high school        high school 
##           536.5940           558.7106           598.8742 
##            college   university above 
##           645.2816           660.9434
#同父母教育程度下的數學分數平均數,加上信賴區間
ggplot(data = dta, 
       aes(x = parental.education, y = math)) +
  stat_summary(fun.data = 'mean_cl_boot', size = 1) +
  scale_y_continuous(breaks = seq(500, 660, by = 20)) +
  geom_hline(yintercept = mean(dta$math) , 
             linetype = 'dotted') +
  labs(x = '父母教育', y = '數學平均分數') +
  coord_flip()

在這裡我們推測父母教育的效果可能是教育資源差距造成的,畫圖觀察看看

anova(m1 <- lm(math ~ parental.education, data = dta))
## Analysis of Variance Table
## 
## Response: math
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## parental.education    4  5634301 1408575  158.12 < 2.2e-16 ***
## Residuals          4462 39748578    8908                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data = dta, 
       aes(group = parental.education, 
          y = math, x = educational.resources)) +
  geom_point() +
  stat_smooth(method = 'lm', se = F) +
  stat_smooth(aes(group = parental.education, 
          y = math, x = educational.resources), 
          method = 'lm', se = F) + 
  facet_grid( . ~  parental.education) +
  labs(x = '教育資源', y = '數學分數')

利用以下 ANOVA 檢驗假設是否正確

#把教育資源加進模型
anova(m2 <- update(m1, . ~ . + 
            educational.resources, data = dta))
## Analysis of Variance Table
## 
## Response: math
##                         Df   Sum Sq Mean Sq F value    Pr(>F)    
## parental.education       4  5634301 1408575  168.57 < 2.2e-16 ***
## educational.resources    1  2471927 2471927  295.82 < 2.2e-16 ***
## Residuals             4461 37276651    8356                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#或許不是父母教育而是教育資源造成
anova(m3 <- update(m2, . ~ . - parental.education,  data = dta))
## Analysis of Variance Table
## 
## Response: math
##                         Df   Sum Sq Mean Sq F value    Pr(>F)    
## educational.resources    1  7799321 7799321  926.57 < 2.2e-16 ***
## Residuals             4465 37583557    8417                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

將 ANOVA 結果做成圖表輸出,先計算一些需要的數據

#將結果放在一個list中
res_lm <- lapply(list(m1, m2, m3), summary)
#比較在控制教育資源下,父母教育的效果
(res_lm[[2]]$r.sq - res_lm[[3]]$r.sq)/res_lm[[2]]$r.sq
## [1] 0.03786054
anova(m3, m2)
## Analysis of Variance Table
## 
## Model 1: math ~ educational.resources
## Model 2: math ~ parental.education + educational.resources
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4465 37583557                                  
## 2   4461 37276651  4    306906 9.1821 2.192e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#比較在控制父母教育下,教育資源的效果
(res_lm[[2]]$r.sq - res_lm[[1]]$r.sq)/res_lm[[1]]$r.sq
## [1] 0.4387283
anova(m1, m2)
## Analysis of Variance Table
## 
## Model 1: math ~ parental.education
## Model 2: math ~ parental.education + educational.resources
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4462 39748578                                  
## 2   4461 37276651  1   2471927 295.82 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

正式畫圖

require(coefplot)
m2 <- lm(math ~ parental.education+educational.resources- 1, 
         data = dta)
coefplot(m2, xlab = '估計值', ylab = '迴歸變項', title = '反應變項 = 數學分數')

把資料與迴歸分析的預測值、殘差與影響度放進資料

fit_m2 <- data.frame(dta[, c(2, 12, 13)], fitted = fitted(m2), resid = resid(m2),
                     infl = influence(m2)$hat )

依父母教育疊合真實觀測值與預測值

ggplot(data = fit_m2, aes(x = math, group = parental.education )) +
 stat_density(geom = 'path', position = 'identity') +
 stat_density(geom = 'path', position = 'identity', aes(x = fitted)) +
 geom_vline(xintercept = c(with(dta, tapply(math,parental.education, mean))), linetype = 'dotted')+
 facet_grid(parental.education ~ .) +
 scale_x_continuous(breaks = seq(200, 900, by = 100))+
 labs(x = '數學分數', y = '機率密度')

看殘差分配,依父母教育,檢視常態與變異數同質假設

ggplot(data = fit_m2, aes(x = scale(resid)), group = parental.education ) +
 stat_density(geom = 'path', position = 'identity', aes(linetype = parental.education)) +
 scale_linetype_manual(values = 5:1) +
 guides(linetype = guide_legend(reverse = TRUE)) +
 labs(x = '標準化殘差', y = '機率密度') +
 theme(legend.position = c(.15, .8))

看看殘差的 Q-Q 圖,依父母教育。檢視常態假設

require(lattice)
qqmath(~ scale(resid) | parental.education, data = fit_m2, type = c('p', 'g', 'r'),
       xlab = '常態位數', ylab = '標準化殘差', layout = c(2, 3),
       pch = '.', cex = 2)

畫預測值與殘差的散佈圖,檢查線性與等分散假設

require(MASS)
ggplot(data = fit_m2, aes(x = fitted, y = scale(resid), group = parental.education )) +
  geom_point(pch = 20, size = 1) +
  stat_smooth(method = 'rlm', se = F) +
  facet_grid(parental.education ~ .) +
  labs(x = '數學預測值', y = '標準化殘差')

呈現影響值(影響估計結果過大的值)與標準化殘差

ggplot(data = fit_m2, aes(x = infl, y = scale(resid), group = parental.education)) +
 geom_text(aes(label = rownames(fit_m2)), cex = 2) +
 geom_hline(yintercept = 0, linetype = 'dotted') +
 facet_grid(parental.education ~ .) +
 labs(x = '影響值', y = '標準化殘差')

看看影響值

summary(influence(m2)$hat)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004960 0.0006413 0.0008784 0.0013432 0.0015737 0.0149093

記得改回舊的配色,不然 R 就黑白下去了

theme_set(old)

Part C. 接下來分析資料集當中的其他變項是否和數學成績有關


dta_math <- dta[, c('math', 'math.interest', 
                    'math.evaluation', 'math.input')]

看看基本統計量

colMeans(dta_math)
##            math   math.interest math.evaluation      math.input 
##      618.058149        9.079051        8.356993        8.583309

呈現兩兩散佈圖

require(heplots)
scatterplotMatrix(~ math + math.interest + math.evaluation + math.input, data= dta_math,
  pch = '.', cex = 3, smooth = FALSE, reg.line = FALSE, ellipse = TRUE,
  diagonal = 'none', lower.panel = NULL)

我們利用corrplot 套件,以圖形顯示相關性的大小

require(corrplot)
corrplot(cor(dta_math), method = 'ellipse', order = 'hclust', addrect = 4,
         type = 'upper', tl.pos = 'tp')
corrplot(cor(dta_math), add = TRUE, type = 'lower', method = 'number',
         order = 'hclust', col = 'black', diag = FALSE, tl.pos = 'n', cl.pos = 'n')

放進三個解釋變項

summary(m4 <- lm(math ~ math.interest + math.evaluation + math.input, data = dta_math))
## 
## Call:
## lm(formula = math ~ math.interest + math.evaluation + math.input, 
##     data = dta_math)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -427.11  -54.39    6.97   61.36  277.72 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     383.4531     7.4041  51.790  < 2e-16 ***
## math.interest    16.7783     0.9436  17.780  < 2e-16 ***
## math.evaluation   7.0554     0.9501   7.426 1.33e-13 ***
## math.input        2.7160     1.0774   2.521   0.0117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 88.76 on 4463 degrees of freedom
## Multiple R-squared:  0.2252, Adjusted R-squared:  0.2247 
## F-statistic: 432.5 on 3 and 4463 DF,  p-value: < 2.2e-16

來看看效果如何

coefplot(m4, predictors = c('math.interest', 'math.evaluation', 
                            'math.input'),
 xlab = '估計值', ylab = '迴歸變項(去除截距)', title = '反應變項是數學分數')

require(effects)
plot(allEffects(m4), main = '', ylim = c(550, 670), grid = T)

利用 lm.beta 套件,計算標準化迴歸係數

library(lm.beta)
summary(lm.beta(m4))
## 
## Call:
## lm(formula = math ~ math.interest + math.evaluation + math.input, 
##     data = dta_math)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -427.11  -54.39    6.97   61.36  277.72 
## 
## Coefficients:
##                  Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)     383.45307      0.00000    7.40407  51.790  < 2e-16 ***
## math.interest    16.77826      0.34905    0.94365  17.780  < 2e-16 ***
## math.evaluation   7.05539      0.12893    0.95007   7.426 1.33e-13 ***
## math.input        2.71604      0.04592    1.07742   2.521   0.0117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 88.76 on 4463 degrees of freedom
## Multiple R-squared:  0.2252, Adjusted R-squared:  0.2247 
## F-statistic: 432.5 on 3 and 4463 DF,  p-value: < 2.2e-16

看看控制數學興趣與數學評價後,數學投入的效果

summary(m5 <- update(m4, . ~ . - math.input , data = dta_math))
## 
## Call:
## lm(formula = math ~ math.interest + math.evaluation, data = dta_math)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -423.64  -54.22    6.82   61.30  281.97 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     391.5857     6.6683  58.724  < 2e-16 ***
## math.interest    17.9769     0.8156  22.042  < 2e-16 ***
## math.evaluation   7.5696     0.9285   8.153 4.58e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 88.81 on 4464 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.2238 
## F-statistic: 644.8 on 2 and 4464 DF,  p-value: < 2.2e-16
anova(m5, m4)
## Analysis of Variance Table
## 
## Model 1: math ~ math.interest + math.evaluation
## Model 2: math ~ math.interest + math.evaluation + math.input
##   Res.Df      RSS Df Sum of Sq      F  Pr(>F)  
## 1   4464 35211080                              
## 2   4463 35161015  1     50065 6.3548 0.01174 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

math ~ math.interest + math.hours + math.evaluation + educational.resources

dta[,2] ~ dta[,3] + dta[,6] + dta[,4] + dta[,13]

m5 <- lm(math ~ math.interest + math.hours + math.evaluation + educational.resources, data = dta)
fit_m5 <- data.frame(dta[, c(2, 3, 6, 4, 13)], fitted = fitted(m5), resid = resid(m5), infl = influence(m5)$hat)

ggplot(data = fit_m5, aes(x = math, group = math.hours )) +
 stat_density(geom = 'path', position = 'identity') +
 stat_density(geom = 'path', position = 'identity', aes(x = fitted)) +
 geom_vline(xintercept = c(with(dta, tapply(math, math.hours, mean))), linetype = 'dotted')+
 facet_grid(math.hours ~ .) +
 scale_x_continuous(breaks = seq(200, 900, by = 100))+
 labs(x = '數學分數', y = '機率密度')