library(ggplot2)
library(e1071) #for svm
library(lattice)
library(caret) #for confusionmatrix
在此使用ggplot2內建的dataset:diamonds
data <- as.data.frame(diamonds)
#觀察資料
head(data)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
summary(data)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
#因資料比數過多,且原始資料為按照價格排序,故先亂數取200筆資料作討論
set.seed(1010)
data <- data[sample(nrow(data), 200),]
1.先繪製所有變量的scatter plot,以找出較為相關的變量組合
plot(data)
2.重量(克拉)與價格的關係圖
ggplot(data = data) +
geom_point(aes(x = carat,
y = price,
color = color,
shape = cut))
由圖可看出兩者略為線性關係,但仍不明顯
3.將兩者取log後作圖後,明顯可看出其線性關係
ggplot(data = data) +
geom_point(aes(x = log(carat),
y = log(price),
color = color,
shape = cut))
4.體積(xyz)與重量(克拉)之關係圖 兩者之間明顯為線性關係,也就是說鑽石的密度幾乎不變
ggplot(data = data) +
geom_point(aes(x = x*y*z,
y = carat,
color = color,
shape = cut))
5.不同種顏色的鑽石數目比較
qplot(carat, data = diamonds, geom = "histogram",
fill = color,
binwidth = 0.5, xlim = c(0, 3))
## Warning: Removed 32 rows containing non-finite values (stat_bin).
假設檢定對應的虛無假設H0:μ(carat) = μ(price)
anova(a <- lm(carat ~ price, data = data))
## Analysis of Variance Table
##
## Response: carat
## Df Sum Sq Mean Sq F value Pr(>F)
## price 1 33.447 33.447 983.06 < 2.2e-16 ***
## Residuals 198 6.737 0.034
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
檢定的結果 p-value < 2.2e-16 也遠小於 0.05,故拒絕虛無假設
預測鑽石位在哪個價錢區間class
#將價格分成三個價錢區間 class3[0, 1000), class2=[1000, 10000), class1=[10000, 100000)
for (i in 1:nrow(data)) {
p <- data$price[i]
if (p >= 10000) { data$class[i] = 1 }
if (1000 <= p & p < 10000) { data$class[i] = 2 }
if (p < 1000) { data$class[i] = 3 }
}
#選取100筆資料作為訓練資料,另100筆資料做為測試資料
testID = sample(nrow(data), 100, replace = FALSE)
testID
## [1] 62 85 125 137 95 19 121 161 87 52 181 22 154 18 177 109 12
## [18] 183 23 168 130 180 96 66 42 91 77 166 115 55 133 103 25 74
## [35] 31 14 89 132 33 153 54 182 134 186 176 188 128 90 17 21 194
## [52] 82 148 29 28 80 167 108 76 151 169 67 185 163 46 147 71 105
## [69] 138 123 197 144 57 122 190 20 88 170 142 69 100 184 70 64 152
## [86] 160 24 40 200 139 9 114 51 63 118 59 112 156 178 73
x <- subset(data[testID, ], select = -class)
y <- as.factor(data$class[testID])
svm_model =
svmfit = svm(class ~ ., data = data[-testID, ], type = "C-classification") #預設似乎是跑線性迴歸,因此需將type改為分類
pred = predict(svm_model, x)
#畫出混淆矩陣
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 9 0 0
## 2 1 64 1
## 3 0 7 18
##
## Overall Statistics
##
## Accuracy : 0.91
## 95% CI : (0.836, 0.958)
## No Information Rate : 0.71
## P-Value [Acc > NIR] : 1.046e-06
##
## Kappa : 0.8105
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.900 0.9014 0.9474
## Specificity 1.000 0.9310 0.9136
## Pos Pred Value 1.000 0.9697 0.7200
## Neg Pred Value 0.989 0.7941 0.9867
## Prevalence 0.100 0.7100 0.1900
## Detection Rate 0.090 0.6400 0.1800
## Detection Prevalence 0.090 0.6600 0.2500
## Balanced Accuracy 0.950 0.9162 0.9305
由混淆矩陣可看出其成功率達到91%,效果相當顯著