library(e1071)
library(ggplot2)
library(stats)
library(jpeg)
本次作業使用basketball-reference所記錄之NBA2017-18賽季球隊進階數據,並且擷取了東區15支球隊於2017/11/17前的所有比賽,共212場比賽數據及21個變數。
NBA Reference:https://www.basketball-reference.com/
nba <- read.csv("hw5data.csv")
#查看資料長相
head(nba)
## Team home Opp W.L Tm Opp.1 ORtg DRtg Pace FTr X3PAr TS. TRB. AST.
## 1 BOS 1 CLE 0 99 102 99.7 102.7 99.3 0.284 0.364 0.500 47.9 66.7
## 2 BOS 0 MIL 0 100 108 102.2 110.4 97.9 0.231 0.308 0.499 48.9 59.0
## 3 BOS 1 PHI 1 102 92 102.5 92.4 99.5 0.381 0.345 0.520 53.9 45.7
## 4 BOS 0 NYK 1 110 89 116.4 94.2 94.5 0.360 0.387 0.633 48.8 70.3
## 5 BOS 1 MIL 1 96 89 99.9 92.6 96.1 0.282 0.385 0.547 54.9 63.6
## 6 BOS 1 MIA 1 96 90 102.4 96.0 93.7 0.321 0.284 0.519 50.6 44.1
## STL. BLK. eFG. TOV. ORB. FT.FGA btob
## 1 11.1 6.6 0.455 9.2 18.0 0.216 0
## 2 12.3 3.6 0.489 10.7 22.0 0.121 1
## 3 4.0 8.3 0.476 16.2 29.8 0.262 0
## 4 11.6 9.6 0.587 12.1 13.5 0.293 0
## 5 7.3 7.7 0.494 14.6 14.6 0.244 0
## 6 11.7 10.4 0.457 11.5 24.4 0.272 0
#觀察資料維度
dim(nba)
## [1] 212 21
#各隊進行的比賽數
table(nba$Team)
##
## ATL BOS BRK CHI CHO CLE DET IND MIA MIL NYK ORL PHI TOR WAS
## 15 15 14 12 13 15 14 15 14 14 14 15 14 14 14
我將資料中所有的比賽分為3群,希望能藉由kmeans法所分出的比賽群集進一步挖掘資料中有趣的現象。
我將分群結果的場次勝負分平均加以計算後,可以看出群一代表大部分贏球的比賽,群二則是代表大部分輸球的比賽,群三由勝負分來看沒那麼容易區分,或許有其他變數決定了這些比賽場次的分群。
kk <- nba[,c(2,7:21)]
set.seed(34)
kmeansOut <- kmeans(kk, 3, nstart = 50)
nba$cluster <- kmeansOut$cluster
nba$winloss <- nba$Tm - nba$Opp.1
#各群的勝負分數平均
tapply(nba$winloss,nba$cluster,mean)
## 1 2 3
## 11.2236842 -13.1692308 0.4507042
#各群勝負分分布圖
ggplot(nba,aes(x=as.factor(cluster),y=winloss))+geom_boxplot()+labs(x='群集',y='勝負分')
這裡我抓了兩個變數進行ANOVA分析,判斷由kmeans分出的群集當中這些變數是否有差異。
首先抓出Pace,也就是比賽的節奏。從p-value=0.847不拒絕H0的結果,推論比賽節奏的快慢並不是贏球或輸球的保證,可能有球隊打快節奏仍然常敗,習慣慢節奏的球隊仍有自己的取勝之道。
接著我抓取TRB%,也就是籃板率。從極小的p-value推論不同群間的籃板率有顯著差異,並且勝分高的比賽普遍有著較高的籃板率,輸球的比賽籃板率亦相對較低,因此驗證了灌籃高手當中,赤木隊長的經典名言:「掌握籃板球的人,便能夠主宰比賽!」
#檢驗每群的pace是否相同
anova(m1 <- lm(cluster~Pace,data=nba))
## Analysis of Variance Table
##
## Response: cluster
## Df Sum Sq Mean Sq F value Pr(>F)
## Pace 1 0.026 0.02609 0.0373 0.847
## Residuals 210 146.856 0.69931
#檢驗每群的籃板率是否相同
anova(m1 <- lm(cluster~TRB.,data=nba))
## Analysis of Variance Table
##
## Response: cluster
## Df Sum Sq Mean Sq F value Pr(>F)
## TRB. 1 5.707 5.7071 8.4894 0.003959 **
## Residuals 210 141.175 0.6723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(nba,aes(x=as.factor(cluster),y=TRB.))+geom_boxplot()+labs(x='群集',y='籃板率')
進一步地,我建立所有變數的相關矩陣,希望能找出各變數的關聯。
cor(nba[,c(2,4,7:21)])
## home W.L ORtg DRtg Pace
## home 1.0000000000 -0.14411401 -0.08599612 0.05700409 0.06496680
## W.L -0.1441140110 1.00000000 0.50950547 -0.49192883 -0.06419789
## ORtg -0.0859961241 0.50950547 1.00000000 0.24789098 -0.06828279
## DRtg 0.0570040912 -0.49192883 0.24789098 1.00000000 0.01636178
## Pace 0.0649668028 -0.06419789 -0.06828279 0.01636178 1.00000000
## FTr -0.0095972774 0.16320737 0.11428123 0.03248178 0.25063939
## X3PAr 0.0322678804 -0.04965557 0.01912462 -0.04424711 -0.07252890
## TS. -0.0139860054 0.44818763 0.86140190 0.19490245 -0.01200980
## TRB. -0.0905666503 0.54162906 0.40229659 -0.35028246 -0.07087659
## AST. 0.0447419970 0.02224883 0.05653580 -0.05137673 -0.06581930
## STL. 0.1196885706 0.14375382 0.03903943 -0.17782101 -0.06159778
## BLK. -0.2075757244 0.17527943 0.03856410 -0.14571468 0.03101238
## eFG. -0.0221204667 0.40235378 0.81841800 0.18188536 -0.03757162
## TOV. 0.1405207753 -0.07176524 -0.19390137 -0.11600480 0.04029083
## ORB. -0.0858760737 0.28507403 0.42031931 -0.01103617 -0.10412203
## FT.FGA -0.0002376905 0.20356603 0.19099851 0.05532316 0.19359745
## btob 0.1110344098 -0.10327154 -0.06700393 0.13352992 0.01702874
## FTr X3PAr TS. TRB. AST.
## home -0.009597277 0.032267880 -0.01398601 -0.09056665 0.04474200
## W.L 0.163207366 -0.049655574 0.44818763 0.54162906 0.02224883
## ORtg 0.114281233 0.019124624 0.86140190 0.40229659 0.05653580
## DRtg 0.032481776 -0.044247106 0.19490245 -0.35028246 -0.05137673
## Pace 0.250639388 -0.072528904 -0.01200980 -0.07087659 -0.06581930
## FTr 1.000000000 -0.071508438 0.13282696 0.05728196 -0.20902443
## X3PAr -0.071508438 1.000000000 0.09120319 -0.01962886 0.37972625
## TS. 0.132826957 0.091203187 1.00000000 0.32120813 0.09817769
## TRB. 0.057281965 -0.019628855 0.32120813 1.00000000 0.02638939
## AST. -0.209024427 0.379726249 0.09817769 0.02638939 1.00000000
## STL. -0.123482253 -0.004446974 0.02523191 -0.07342416 0.13768464
## BLK. 0.067225606 -0.091998401 0.01789936 0.12892810 -0.06255953
## eFG. -0.064833120 0.081256504 0.95967827 0.31249101 0.15403386
## TOV. 0.006439197 0.185543394 0.18985883 0.16708322 0.08047369
## ORB. 0.004290680 -0.049765425 0.12411873 0.64701853 -0.02198255
## FT.FGA 0.938023554 -0.019087282 0.20672446 0.06075201 -0.20968733
## btob -0.050526162 -0.038813660 -0.03226728 -0.07574530 -0.04304214
## STL. BLK. eFG. TOV. ORB.
## home 0.119688571 -0.20757572 -0.02212047 0.140520775 -0.08587607
## W.L 0.143753817 0.17527943 0.40235378 -0.071765237 0.28507403
## ORtg 0.039039429 0.03856410 0.81841800 -0.193901372 0.42031931
## DRtg -0.177821007 -0.14571468 0.18188536 -0.116004805 -0.01103617
## Pace -0.061597782 0.03101238 -0.03757162 0.040290833 -0.10412203
## FTr -0.123482253 0.06722561 -0.06483312 0.006439197 0.00429068
## X3PAr -0.004446974 -0.09199840 0.08125650 0.185543394 -0.04976542
## TS. 0.025231908 0.01789936 0.95967827 0.189858834 0.12411873
## TRB. -0.073424162 0.12892810 0.31249101 0.167083223 0.64701853
## AST. 0.137684639 -0.06255953 0.15403386 0.080473691 -0.02198255
## STL. 1.000000000 0.01673553 0.04242285 0.040743403 0.12223279
## BLK. 0.016735531 1.00000000 0.02779618 0.008541820 0.10436471
## eFG. 0.042422854 0.02779618 1.00000000 0.190274156 0.10847323
## TOV. 0.040743403 0.00854182 0.19027416 1.000000000 0.03907842
## ORB. 0.122232794 0.10436471 0.10847323 0.039078423 1.00000000
## FT.FGA -0.091291734 0.02892389 -0.04686971 0.010947302 0.02852079
## btob 0.044217876 -0.04559102 -0.03047918 0.101111707 -0.01340233
## FT.FGA btob
## home -0.0002376905 0.11103441
## W.L 0.2035660348 -0.10327154
## ORtg 0.1909985144 -0.06700393
## DRtg 0.0553231596 0.13352992
## Pace 0.1935974512 0.01702874
## FTr 0.9380235537 -0.05052616
## X3PAr -0.0190872815 -0.03881366
## TS. 0.2067244585 -0.03226728
## TRB. 0.0607520075 -0.07574530
## AST. -0.2096873279 -0.04304214
## STL. -0.0912917341 0.04421788
## BLK. 0.0289238930 -0.04559102
## eFG. -0.0468697087 -0.03047918
## TOV. 0.0109473022 0.10111171
## ORB. 0.0285207901 -0.01340233
## FT.FGA 1.0000000000 -0.04036072
## btob -0.0403607158 1.00000000
從上述的correlation matrix結果中,我想先挑出兩個與勝負最相關的變數進行二維SVM的建模,希望建出一個能精準分類勝負的模型,因此要取用與勝負相關性高的變數。於是我採用ORtg(進攻效率)跟DRtg(防守效率)兩個變數,並且取用了170筆資料作為train data,41筆資料當作test data,完成的結果如附圖及附表。
traindata1 <- nba[1:170,c(4,6,7)]
testdata1 <- nba[171:212, c(6,7)]
svmfit = svm(as.factor(W.L) ~ ., data = traindata1,
kernel = "polynomial",
cost = 10, scale = FALSE)
#SVM分類圖
plot(svmfit, traindata1)
predict1 = predict(svmfit, testdata1)
#正確度表格
ans1 = table(predict1, nba[171:212, 4])
print(ans1)
##
## predict1 0 1
## 0 25 4
## 1 3 10
#此模型正確率
acc1 = (ans1[1,1]+ans1[2,2])/sum(ans1)
print(acc1)
## [1] 0.8333333
從以上結果可以得知模型精準度達0.833,已經是足夠精確的模型,接著繼續嘗試高維度的SVM,我將網站中截取到的所有進階數據代入模型中,得到了100%的精確度,由此可知這些進階數據已經能完美的詮釋比賽勝負,是能夠參考及解讀的數據。
traindata2 <- nba[1:170,c(4,7:21)]
testdata2 <- nba[171:212, c(7:21)]
svmfit2 = svm(as.factor(W.L) ~ ., data = traindata2,
kernel = "polynomial",
cost = 10, scale = FALSE)
predict2 = predict(svmfit2, testdata2)
#正確度表格
ans2 = table(predict2, nba[171:212, 4])
print(ans2)
##
## predict2 0 1
## 0 28 0
## 1 0 14
acc2 = (ans2[1,1]+ans2[2,2])/sum(ans2)
#此模型正確率
print(acc2)
## [1] 1