Data Mining and Classification with R Examples
Ira Sharenow
Wednesday, August 19, 2015
Points whose first four coordinates satisfy x1^2 + x2^2 + x3^2 + x4^2 <= 0.8 (a four-dimensional ball of radius sqrt(0.8), about 0.89) are labeled "blue"; all other points are labeled "red".
Although each point has six coordinates, the labeling criterion uses only the first four, and the resulting decision region is clearly non-linear.
The points are then slightly perturbed so that the labeling is no longer perfect; only the first five of the six coordinates are jittered.
A random sample of half the points is used for training.
The various methods are then evaluated on the held-out test set.
Since the boundary is highly non-linear, I expected the first few methods to perform poorly, and they did.
Bagging, random forests, and boosting all produced excellent results; however, boosting is extremely slow.
- Actual percent red
- Logistic regression
- LDA
- QDA
- KNN
- Tree
- Bagging
- Random Forest
- Boosting
- Support Vector Machines
- Table of results
set.seed(20150819)
sampleSize = 10000
# Six independent coordinates, each uniform on [-1, 1]
df = data.frame(x1 = runif(n = sampleSize, min = -1, max = 1),
                x2 = runif(n = sampleSize, min = -1, max = 1),
                x3 = runif(n = sampleSize, min = -1, max = 1),
                x4 = runif(n = sampleSize, min = -1, max = 1),
                x5 = runif(n = sampleSize, min = -1, max = 1),
                x6 = runif(n = sampleSize, min = -1, max = 1))
# Function for labeling: only the first four coordinates matter
color = function(x1, x2, x3, x4, x5, x6) if (x1^2 + x2^2 + x3^2 + x4^2 <= 0.8) return("blue") else return("red")
head(df)
## x1 x2 x3 x4 x5 x6
## 1 -0.06276411 -0.8483214 0.347916695 0.02709725 -0.3690519 0.02848432
## 2 0.79737745 0.1749736 0.506957737 0.39603261 0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981 0.923341176 -0.71710376 0.3040652 -0.82590296
## 4 0.82660601 0.2245092 0.002798933 0.69601031 0.5024880 0.96397908
## 5 0.43177409 0.5581343 -0.611626756 -0.11168633 0.2727495 0.95727541
## 6 -0.36671109 0.9372195 -0.341857872 -0.66116307 0.5929235 0.98582181
df$color1 = mapply(color, df$x1, df$x2, df$x3, df$x4, df$x5, df$x6)
df$color1 = factor(df$color1)
contrasts(df$color1) # blue is 0, red is 1
## red
## blue 0
## red 1
head(df)
## x1 x2 x3 x4 x5 x6
## 1 -0.06276411 -0.8483214 0.347916695 0.02709725 -0.3690519 0.02848432
## 2 0.79737745 0.1749736 0.506957737 0.39603261 0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981 0.923341176 -0.71710376 0.3040652 -0.82590296
## 4 0.82660601 0.2245092 0.002798933 0.69601031 0.5024880 0.96397908
## 5 0.43177409 0.5581343 -0.611626756 -0.11168633 0.2727495 0.95727541
## 6 -0.36671109 0.9372195 -0.341857872 -0.66116307 0.5929235 0.98582181
## color1
## 1 red
## 2 red
## 3 red
## 4 red
## 5 red
## 6 red
mean(df$color1 == "blue")
## [1] 0.1919
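As a quick sanity check (added here for reference; not part of the original run), the observed blue fraction can be compared with the exact probability: a point is blue when (x1, x2, x3, x4) lies in the 4-dimensional ball of radius sqrt(0.8), whose volume is pi^2 * r^4 / 2, inside the cube [-1, 1]^4 of volume 16.
# Exact P(blue) = (pi^2 * (sqrt(0.8))^4 / 2) / 2^4 = 0.02 * pi^2
0.02 * pi^2 # about 0.197, close to the observed 0.1919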
# Perturb co-ordinates 1-5 (x6 is left untouched)
df$x1 = df$x1 + rnorm(sampleSize, 0, 0.1)
df$x2 = df$x2 + rnorm(sampleSize, 0, 0.1)
df$x3 = df$x3 + rnorm(sampleSize, 0, 0.1)
df$x4 = df$x4 + rnorm(sampleSize, 0, 0.1)
df$x5 = df$x5 + rnorm(sampleSize, 0, 0.1)
# Divide points into training and test sets
train = sample(1:sampleSize, sampleSize/2)
dfTrain = df[train, ]
dim(dfTrain)
## [1] 5000 7
dfTest = df[-train, ]
dim(dfTest)
## [1] 5000 7
# Set-up is now complete
# Now fit and evaluate the models
# 1. Logistic Regression
glm.fit = glm(color1 ~ ., data = dfTrain, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = color1 ~ ., family = binomial, data = dfTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9033 0.6213 0.6528 0.6756 0.7553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.404084 0.035614 39.426 <2e-16 ***
## x1 -0.080827 0.060590 -1.334 0.182
## x2 -0.043699 0.060668 -0.720 0.471
## x3 0.008068 0.060706 0.133 0.894
## x4 -0.058953 0.060893 -0.968 0.333
## x5 0.082850 0.060944 1.359 0.174
## x6 0.064199 0.061538 1.043 0.297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4965.0 on 4999 degrees of freedom
## Residual deviance: 4958.8 on 4993 degrees of freedom
## AIC: 4972.8
##
## Number of Fisher Scoring iterations: 4
# help(predict)
glm.probs = predict(glm.fit, newdata = dfTest, type = "response")
glm.probs[1:10]
## 1 4 6 9 11 12 13
## 0.8053388 0.8050761 0.8259650 0.8059137 0.8093152 0.8082293 0.8039616
## 16 17 18
## 0.8077144 0.8256474 0.8096335
glm.pred = rep("red", sampleSize/2)
glm.pred[glm.probs < .5] = "blue"
table(glm.pred, dfTest$color1)
##
## glm.pred blue red
## red 933 4067
mean(glm.pred == dfTest$color1) # 0.8134: every test point is predicted "red", so accuracy equals the test-set red rate
## [1] 0.8134
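Logistic regression with a predictor that is linear in x1-x6 cannot represent a spherical boundary, so it never predicts blue. As an aside, a sketch (not part of the original run) of how adding the squared coordinates would let the same model express the true rule:
# Sketch only: quadratic terms let the linear predictor approximate the sphere x1^2 + ... + x4^2 <= 0.8
glm.fit2 = glm(color1 ~ I(x1^2) + I(x2^2) + I(x3^2) + I(x4^2) + I(x5^2) + I(x6^2),
               data = dfTrain, family = binomial)
glm.pred2 = ifelse(predict(glm.fit2, newdata = dfTest, type = "response") < .5, "blue", "red")
mean(glm.pred2 == dfTest$color1) # expected to be well above the base rate (not run here)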
# 2. Linear Discriminant Analysis
library(MASS)
lda.fit = lda(color1 ~ ., data = dfTrain)
lda.fit
## Call:
## lda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
## blue red
## 0.1972 0.8028
##
## Group means:
## x1 x2 x3 x4 x5
## blue 0.030187117 0.00373233 -0.01429667 0.008993027 -0.01223305
## red 0.002206837 -0.01218605 -0.01279927 -0.010448713 0.01591678
## x6
## blue 0.002238118
## red 0.023971066
##
## Coefficients of linear discriminants:
## LD1
## x1 -0.91223082
## x2 -0.49229573
## x3 0.09083985
## x4 -0.66651260
## x5 0.93584612
## x6 0.72461144
lda.pred = predict(lda.fit, dfTest)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, dfTest$color1)
##
## lda.class blue red
## blue 0 0
## red 933 4067
mean(lda.class == dfTest$color1) # 0.8134: LDA also classifies every test point as "red"
## [1] 0.8134
# 3. Quadratic Discriminant Analysis
qda.fit = qda(color1 ~ ., data = dfTrain)
qda.fit
## Call:
## qda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
## blue red
## 0.1972 0.8028
##
## Group means:
## x1 x2 x3 x4 x5
## blue 0.030187117 0.00373233 -0.01429667 0.008993027 -0.01223305
## red 0.002206837 -0.01218605 -0.01279927 -0.010448713 0.01591678
## x6
## blue 0.002238118
## red 0.023971066
qda.pred = predict(qda.fit, dfTest)
qda.class = qda.pred$class
table(qda.class, dfTest$color1)
##
## qda.class blue red
## blue 131 0
## red 802 4067
mean(qda.class == dfTest$color1) # 0.8396
## [1] 0.8396
# 4. KNN
library(class)
knn.pred3 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 3)
table(knn.pred3, dfTest$color1)
##
## knn.pred3 blue red
## blue 596 267
## red 337 3800
mean(knn.pred3 == dfTest$color1) # 0.8792
## [1] 0.8792
knn.pred5 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 5)
table(knn.pred5, dfTest$color1)
##
## knn.pred5 blue red
## blue 597 206
## red 336 3861
mean(knn.pred5 == dfTest$color1) # 0.8916
## [1] 0.8916
knn.pred7 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 7)
table(knn.pred7, dfTest$color1)
##
## knn.pred7 blue red
## blue 593 176
## red 340 3891
mean(knn.pred7 == dfTest$color1) # 0.8968
## [1] 0.8968
knn.pred15 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 15)
table(knn.pred15, dfTest$color1)
##
## knn.pred15 blue red
## blue 574 117
## red 359 3950
mean(knn.pred15 == dfTest$color1) # 0.9048
## [1] 0.9048
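Rather than fitting one value of k at a time, a short loop (a sketch, not part of the original run) makes the comparison over k more systematic:
# Sketch: test-set accuracy of KNN over a range of k
for (k in c(1, 3, 5, 7, 15, 25, 51)) {
  pred = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = k)
  cat("k =", k, " accuracy =", mean(pred == dfTest$color1), "\n")
}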
# 5. Tree
library(tree)
## Warning: package 'tree' was built under R version 3.1.3
tree.colors = tree(color1 ~ ., data = dfTrain)
summary(tree.colors)
##
## Classification tree:
## tree(formula = color1 ~ ., data = dfTrain)
## Variables actually used in tree construction:
## [1] "x1" "x4" "x3" "x2"
## Number of terminal nodes: 11
## Residual mean deviance: 0.5755 = 2871 / 4989
## Misclassification error rate: 0.0974 = 487 / 5000
tree.colors
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 4965.00 red ( 0.19720 0.80280 )
## 2) x1 < -0.650728 873 254.50 red ( 0.03322 0.96678 ) *
## 3) x1 > -0.650728 4127 4470.00 red ( 0.23189 0.76811 )
## 6) x1 < 0.641545 3206 3829.00 red ( 0.28447 0.71553 )
## 12) x4 < -0.642267 588 254.20 red ( 0.05612 0.94388 ) *
## 13) x4 > -0.642267 2618 3341.00 red ( 0.33575 0.66425 )
## 26) x4 < 0.546652 1920 2618.00 red ( 0.42448 0.57552 )
## 52) x3 < 0.678641 1612 2234.00 red ( 0.48883 0.51117 )
## 104) x3 < -0.549429 421 358.90 red ( 0.15202 0.84798 ) *
## 105) x3 > -0.549429 1191 1595.00 blue ( 0.60789 0.39211 )
## 210) x2 < 0.774734 1063 1343.00 blue ( 0.67357 0.32643 )
## 420) x2 < -0.675639 186 167.60 red ( 0.16667 0.83333 ) *
## 421) x2 > -0.675639 877 921.80 blue ( 0.78107 0.21893 )
## 842) x2 < 0.551288 757 691.20 blue ( 0.82959 0.17041 )
## 1684) x2 < -0.435063 152 202.10 blue ( 0.61842 0.38158 ) *
## 1685) x2 > -0.435063 605 437.60 blue ( 0.88264 0.11736 ) *
## 843) x2 > 0.551288 120 166.10 red ( 0.47500 0.52500 ) *
## 211) x2 > 0.774734 128 59.85 red ( 0.06250 0.93750 ) *
## 53) x3 > 0.678641 308 183.00 red ( 0.08766 0.91234 ) *
## 27) x4 > 0.546652 698 427.80 red ( 0.09169 0.90831 ) *
## 7) x1 > 0.641545 921 359.50 red ( 0.04886 0.95114 ) *
tree.pred = predict(tree.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)
##
## tree.pred blue red
## blue 574 145
## red 359 3922
mean(tree.pred == dfTest$color1)
## [1] 0.8992
# Try pruning via cross-validation
cv.colors = cv.tree(tree.colors, FUN = prune.misclass)
names(cv.colors)
## [1] "size" "dev" "k" "method"
cv.colors
## $size
## [1] 11 10 9 1
##
## $dev
## [1] 548 548 550 986
##
## $k
## [1] -Inf 0.000 6.000 61.625
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Now prune the tree
# Note: best = 15 exceeds the 11 terminal nodes of the fitted tree, so no pruning actually occurs;
# the cross-validation results above suggest best = 10 as a reasonable choice
prune.colors = prune.misclass(tree.colors, best = 15)
## Warning in prune.tree(tree = tree.colors, best = 15, method = "misclass"):
## best is bigger than tree size
plot(prune.colors)
text(prune.colors, pretty = 0)
# Performance on the test set
tree.pred = predict(prune.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)
##
## tree.pred blue red
## blue 574 145
## red 359 3922
mean(tree.pred == dfTest$color1) # 0.8992
## [1] 0.8992
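Because best = 15 exceeds the size of the fitted tree, the "pruned" tree is identical to the original and the test results are unchanged. A sketch (not run here) of pruning to the size suggested by cross-validation:
# Sketch: prune to 10 terminal nodes, the smallest size with minimal CV deviance in cv.colors
prune.colors10 = prune.misclass(tree.colors, best = 10)
tree.pred10 = predict(prune.colors10, dfTest, type = "class")
mean(tree.pred10 == dfTest$color1)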
# 6. Bagging
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# mtry = 6 uses all six predictors at every split, i.e. bagging
bag.colors = randomForest(color1 ~ ., data = dfTrain,
                          mtry = 6, importance = TRUE)
yhat.bag = predict(bag.colors, newdata = dfTest)
table(yhat.bag, dfTest$color1)
##
## yhat.bag blue red
## blue 676 119
## red 257 3948
mean(yhat.bag == dfTest$color1) # 0.9248
## [1] 0.9248
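Bagged trees and random forests also come with an out-of-bag (OOB) error estimate, which gives a cross-check on the test-set accuracy without touching the hold-out data; printing the fitted object shows it (output omitted here):
bag.colors # the printed summary includes the OOB estimate of the error rate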
# 7. Random Forest
# Use 3 of 6 predictors at each split
bag.colorsF3 = randomForest(color1 ~ ., data = dfTrain,
                            mtry = 3, importance = TRUE)
yhat.bagF3 = predict(bag.colorsF3, newdata = dfTest)
table(yhat.bagF3, dfTest$color1)
##
## yhat.bagF3 blue red
## blue 675 120
## red 258 3947
mean(yhat.bagF3 == dfTest$color1)
## [1] 0.9244
sum(yhat.bagF3 == dfTest$color1)
## [1] 4622
# Use 2 of 6 predictors at each split
bag.colorsF2 = randomForest(color1 ~ ., data = dfTrain,
                            mtry = 2, importance = TRUE)
yhat.bagF2 = predict(bag.colorsF2, newdata = dfTest)
table(yhat.bagF2, dfTest$color1)
##
## yhat.bagF2 blue red
## blue 675 119
## red 258 3948
mean(yhat.bagF2 == dfTest$color1) # 0.9246
## [1] 0.9246
table(yhat.bag == yhat.bagF2)
##
## FALSE TRUE
## 1 4999
mean(yhat.bag == yhat.bagF2) # 0.9998
## [1] 0.9998
importance(bag.colors)
## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 154.2442790 123.3634200 176.245850 336.74727
## x2 142.5286583 107.2910048 160.290965 376.21805
## x3 142.0714991 110.0119122 163.214468 354.68932
## x4 151.4785433 117.4098121 180.011101 344.86527
## x5 0.9033374 3.5341044 3.557608 89.98382
## x6 -2.3339885 0.6331717 -0.676042 79.20312
importance(bag.colorsF2)
## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 119.986645 98.646841 138.8348004 350.4371
## x2 108.844194 84.839652 121.1783311 331.8474
## x3 115.647941 90.477256 127.1313051 340.0016
## x4 122.994365 95.045096 134.3153792 350.9749
## x5 3.263936 1.152052 2.7831518 107.2180
## x6 -1.215705 1.427644 0.4685702 101.0973
importance(bag.colorsF3)
## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 141.119517 106.3559363 164.9998082 348.7953
## x2 130.473486 100.7578932 143.6887800 351.8899
## x3 127.689675 105.8876836 152.4531690 348.3248
## x4 140.001527 109.0224862 163.5321583 352.3392
## x5 1.601669 -1.4202147 -0.2838376 94.0987
## x6 -2.164920 -0.4052774 -1.6139238 86.1505
varImpPlot(bag.colorsF3) # x5 and x6 show near-zero importance, as expected since the label depends only on x1-x4
# 8. Boosting
library(caret) # use this library to help with the set-up
## Loading required package: lattice
## Loading required package: ggplot2
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
system.time(boost.colors <- train(color1 ~ ., method = "gbm", data = dfTrain, verbose = FALSE))
## Loading required package: plyr
## user system elapsed
## 55.51 0.08 56.52
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)
grid <- expand.grid(interaction.depth = seq(3, 7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1))
system.time(gbmTune <- train(color1 ~ ., data = dfTrain,
                             method = "gbm",
                             metric = "ROC",
                             tuneGrid = grid,
                             verbose = FALSE,
                             trControl = ctrl))
## user system elapsed
## 1923.45 0.51 1935.69
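The long run time is driven by the size of the tuning grid: 3 interaction depths x 19 values of n.trees x 2 shrinkage values = 114 candidate settings, each assessed by 10-fold cross-validation repeated 5 times (repeatedcv uses 10 folds by default), i.e. 50 resamples per setting.
nrow(grid) # 114 tuning combinations, each evaluated on 50 resamples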
library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")
gbmPred = predict(gbmTune, dfTest)
gbmProbs = predict(gbmTune, dfTest, type = "prob")
confusionMatrix(gbmPred, dfTest$color1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction blue red
## blue 689 92
## red 244 3975
##
## Accuracy : 0.9328
## 95% CI : (0.9255, 0.9396)
## No Information Rate : 0.8134
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7638
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7385
## Specificity : 0.9774
## Pos Pred Value : 0.8822
## Neg Pred Value : 0.9422
## Prevalence : 0.1866
## Detection Rate : 0.1378
## Detection Prevalence : 0.1562
## Balanced Accuracy : 0.8579
##
## 'Positive' Class : blue
##
mean(gbmPred == dfTest$color1)
## [1] 0.9328
sum(gbmPred == dfTest$color1)
## [1] 4664
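The contents above list Support Vector Machines, but no SVM fit appears in this write-up. A minimal sketch of how one could be added with the e1071 package (an assumption on my part; this code is not part of the original analysis and was not run):
library(e1071)
# A radial-kernel SVM should be able to capture the spherical boundary
svm.fit = svm(color1 ~ ., data = dfTrain, kernel = "radial", cost = 1, gamma = 0.5)
svm.pred = predict(svm.fit, dfTest)
table(svm.pred, dfTest$color1)
mean(svm.pred == dfTest$color1)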
Results (Percent Correct)
- Actual percent red (base rate): 80.81
- Logistic Regression: 81.34
- LDA: 81.34
- QDA: 83.96
- KNN (k = 15): 90.48 (best KNN result)
- Tree: 89.92
- Bagging: 92.48
- Random Forest: 92.46 (best random forest result)
- Boosting: 93.28