從 LIBSVM Data 取得 wine dataset (https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale)。
透過 read.libsvm.R 讀入資料並整理之。
# Load the LIBSVM reader helper and parse the scaled wine file into a data frame.
# The second argument (13) is forwarded to read.libsvm unchanged; presumably it
# is the number of feature columns -- confirm against read.libsvm.R.
source("read.libsvm.R")
wine <- as.data.frame(read.libsvm("wine.scale", 13))
# Label the 14 columns: "type" first, then the measured attributes
wine <- setNames(
  wine,
  c(
    "type", "alc", "acid", "ash", "alk", "mag", "phenols",
    "flav", "nonflav", "proanth", "color", "hue", "OD", "proline"
  )
)
# Store the wine type as a factor so it is treated as a class label
wine[["type"]] <- as.factor(wine[["type"]])
library(e1071)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lattice)
library(ggplot2)
library(caret)
透過繪圖找出最能明顯區分出不同酒種的兩個變量。
先繪製所有變量的 scatter plot,找出較能明顯區分出不同酒種的變量組合。
# Plot the whole wine data frame at once to get an overview of every variable
# against every other, as a first pass at spotting pairs that separate the
# wine types.
plot(wine)
取出上述繪圖中,較能明顯區分出不同酒種的變量組合再做 plot,找出最能明顯區分出不同酒種的變量組合。
# Candidate variable pairs, each drawn as a scatter plot with points coloured
# by wine type. The alternatives are kept commented out for reference; only
# the (flav, hue) pair is actually drawn.
#plot(wine$acid, wine$OD, col=wine$type)
#plot(wine$acid, wine$phenols, col=wine$type)
#plot(wine$OD, wine$phenols, col=wine$type)
#plot(wine$color, wine$hue, col=wine$type)
#plot(wine$alc, wine$color, col=wine$type)
#plot(wine$alc, wine$hue, col=wine$type)
#plot(wine$alc, wine$flav, col=wine$type)
#plot(wine$flav, wine$color, col=wine$type)
plot(wine$flav, wine$hue, col=wine$type)
在以上各變量組合的 trial & error 後,發現組合 (flav, hue) 最能明顯區分出不同酒種。
首先建立新的 dataset,其中包含自變量類黃酮含量 (flav)、色澤 (hue),以及因變量酒種 (type)。
# Create a smaller dataset holding only the two chosen predictors (flav, hue)
# and the class label (type).
col <- c("flav", "hue", "type")
# all_of() marks `col` as an external character vector of column names.
# Passing the bare name to select() is ambiguous (it would pick a column
# literally called "col" if one existed) and is deprecated by tidyselect.
dta <- wine %>%
  select(all_of(col))
其次,建立 testing data。
# Create the testing sample.
# Fix the RNG seed so the same rows are drawn on every run; without it the
# split, and therefore the accuracy reported below, changes on each execution.
set.seed(123)
# Draw 78 distinct row indices of dta to hold out for testing
testId <- sample(nrow(dta), 78, replace = FALSE)
# x: predictor columns of the held-out rows (everything except the label)
x <- subset(dta[testId, ], select = -type)
# y: true wine type of the same held-out rows
y <- dta$type[testId]
最後,透過 SVM 演算法得出 training model,代入 testing data 以測試成效。透過 confusionMatrix(pred, y) 計算混淆矩陣,得到預測模型準確率約為 91%(下方輸出為 0.9103;實際數值取決於隨機抽出的 testing data)。
# Train the model.
# Training set: every row of dta that was not drawn into the testing sample.
# (Previously stored under the misleading name `trainingId` and never used.)
trainingData <- dta[-testId, ]
# Fit an SVM predicting wine type from the remaining columns of the training
# set. The model is stored once; the redundant chained `svmfit` copy is gone.
svm_model1 <- svm(type ~ ., data = trainingData)
# Predict the wine type of the held-out rows and compare with the true labels
pred <- predict(svm_model1, x)
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 23 3 0
## 2 4 27 0
## 3 0 0 21
##
## Overall Statistics
##
## Accuracy : 0.9103
## 95% CI : (0.8238, 0.9632)
## No Information Rate : 0.3846
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8639
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.8519 0.9000 1.0000
## Specificity 0.9412 0.9167 1.0000
## Pos Pred Value 0.8846 0.8710 1.0000
## Neg Pred Value 0.9231 0.9362 1.0000
## Prevalence 0.3462 0.3846 0.2692
## Detection Rate 0.2949 0.3462 0.2692
## Detection Prevalence 0.3333 0.3974 0.2692
## Balanced Accuracy 0.8965 0.9083 1.0000