Data Mining and Classification with R Examples

Ira Sharenow

Wednesday, August 19, 2015

Points whose first four coordinates satisfy x1^2 + x2^2 + x3^2 + x4^2 <= 0.8 are labeled "blue"; all other points are labeled "red".
Although each point has six coordinates, the labeling rule uses only the first four.
The decision boundary is therefore highly non-linear.
The points are then slightly perturbed so that the labels are no longer a perfect function of the coordinates; only the first five of the six coordinates are jittered.
Half of the points are then sampled as a training set, and the various classifiers are evaluated on the held-out test set.

Since the boundary is highly non-linear, I expected the first few methods (logistic regression, LDA, and QDA) to perform poorly, and they did.

Bagging, random forests, and boosting all produced excellent results; boosting, however, is extremely slow.

  1. Actual percent red
  2. Logistic regression
  3. LDA
  4. QDA
  5. KNN
  6. Tree
  7. Bagging
  8. Random Forest
  9. Boosting
  10. Support Vector Machines
  11. Table of results

set.seed(20150819)
sampleSize = 10000
df = data.frame(x1 = runif(n = sampleSize, min = -1, max = 1),
                x2 = runif(n = sampleSize, min = -1, max = 1),
                x3 = runif(n = sampleSize, min = -1, max = 1),
                x4 = runif(n = sampleSize, min = -1, max = 1),
                x5 = runif(n = sampleSize, min = -1, max = 1),
                x6 = runif(n = sampleSize, min = -1, max = 1))
# Function for labeling: only the first four coordinates are used
color = function(x1, x2, x3, x4, x5, x6) if (x1^2 + x2^2 + x3^2 + x4^2 <= 0.8) return("blue") else return("red")
head(df)

## x1 x2 x3 x4 x5 x6
## 1 -0.06276411 -0.8483214 0.347916695 0.02709725 -0.3690519 0.02848432
## 2 0.79737745 0.1749736 0.506957737 0.39603261 0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981 0.923341176 -0.71710376 0.3040652 -0.82590296
## 4 0.82660601 0.2245092 0.002798933 0.69601031 0.5024880 0.96397908
## 5 0.43177409 0.5581343 -0.611626756 -0.11168633 0.2727495 0.95727541
## 6 -0.36671109 0.9372195 -0.341857872 -0.66116307 0.5929235 0.98582181

df$color1 = mapply(color, df$x1, df$x2, df$x3, df$x4, df$x5, df$x6)
df$color1 = factor(df$color1)
contrasts(df$color1) # blue is 0, red is 1

## red
## blue 0
## red 1

head(df)

## x1 x2 x3 x4 x5 x6
## 1 -0.06276411 -0.8483214 0.347916695 0.02709725 -0.3690519 0.02848432
## 2 0.79737745 0.1749736 0.506957737 0.39603261 0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981 0.923341176 -0.71710376 0.3040652 -0.82590296
## 4 0.82660601 0.2245092 0.002798933 0.69601031 0.5024880 0.96397908
## 5 0.43177409 0.5581343 -0.611626756 -0.11168633 0.2727495 0.95727541
## 6 -0.36671109 0.9372195 -0.341857872 -0.66116307 0.5929235 0.98582181
## color1
## 1 red
## 2 red
## 3 red
## 4 red
## 5 red
## 6 red

mean(df$color1 == "blue")

## [1] 0.1919

# Perturb coordinates 1-5 (x6 is left unchanged)
df$x1 = df$x1 + rnorm(sampleSize, 0, 0.1)
df$x2 = df$x2 + rnorm(sampleSize, 0, 0.1)
df$x3 = df$x3 + rnorm(sampleSize, 0, 0.1)
df$x4 = df$x4 + rnorm(sampleSize, 0, 0.1)
df$x5 = df$x5 + rnorm(sampleSize, 0, 0.1)
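
Not in the original write-up, but a quick check of how much label noise the jitter introduces: re-apply the labeling rule to the perturbed coordinates and count disagreements with the original labels (perturbedLabel is a hypothetical name).

# Fraction of points whose label would change if re-labeled after the jitter;
# this roughly estimates the irreducible error of the problem
perturbedLabel = mapply(color, df$x1, df$x2, df$x3, df$x4, df$x5, df$x6)
mean(perturbedLabel != as.character(df$color1))
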
# Divide points into training and test sets
train = sample(1:sampleSize, sampleSize/2)
dfTrain = df[train, ]
dim(dfTrain)

## [1] 5000 7

dfTest = df[-train, ]
dim(dfTest)

## [1] 5000 7

# Setup is now complete
# Now perform the classification tests

# 1. Logistic Regression
glm.fit = glm(color1 ~ ., data = dfTrain, family = binomial)
summary(glm.fit)

##
## Call:
## glm(formula = color1 ~ ., family = binomial, data = dfTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9033 0.6213 0.6528 0.6756 0.7553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.404084 0.035614 39.426 <2e-16 ***
## x1 -0.080827 0.060590 -1.334 0.182
## x2 -0.043699 0.060668 -0.720 0.471
## x3 0.008068 0.060706 0.133 0.894
## x4 -0.058953 0.060893 -0.968 0.333
## x5 0.082850 0.060944 1.359 0.174
## x6 0.064199 0.061538 1.043 0.297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4965.0 on 4999 degrees of freedom
## Residual deviance: 4958.8 on 4993 degrees of freedom
## AIC: 4972.8
##
## Number of Fisher Scoring iterations: 4

# help(predict)
glm.probs = predict(glm.fit, newdata = dfTest, type = "response")
glm.probs[1:10]

## 1 4 6 9 11 12 13
## 0.8053388 0.8050761 0.8259650 0.8059137 0.8093152 0.8082293 0.8039616
## 16 17 18
## 0.8077144 0.8256474 0.8096335

glm.pred = rep("red", sampleSize/2)
glm.pred[glm.probs < .5] = "blue"
table(glm.pred, dfTest$color1)

##
## glm.pred blue red
## red 933 4067

mean(glm.pred == dfTest$color1) # 0.8134

## [1] 0.8134

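Since the true boundary is a ball in the first four coordinates, a logistic model that is linear in the squared coordinates should do far better. A minimal sketch of that check, not part of the original analysis (glm.fit2 and the other names are hypothetical):

# Logistic regression on squared coordinates: the true rule,
# x1^2 + x2^2 + x3^2 + x4^2 <= 0.8, is linear in these features
glm.fit2 = glm(color1 ~ I(x1^2) + I(x2^2) + I(x3^2) + I(x4^2),
               data = dfTrain, family = binomial)
glm.probs2 = predict(glm.fit2, newdata = dfTest, type = "response")
glm.pred2 = ifelse(glm.probs2 < .5, "blue", "red")
table(glm.pred2, dfTest$color1)
mean(glm.pred2 == dfTest$color1)
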
# 2. Linear Discriminant Analysis
library(MASS)
lda.fit = lda(color1 ~ ., data = dfTrain)
lda.fit

## Call:
## lda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
## blue red
## 0.1972 0.8028
##
## Group means:
## x1 x2 x3 x4 x5
## blue 0.030187117 0.00373233 -0.01429667 0.008993027 -0.01223305
## red 0.002206837 -0.01218605 -0.01279927 -0.010448713 0.01591678
## x6
## blue 0.002238118
## red 0.023971066
##
## Coefficients of linear discriminants:
## LD1
## x1 -0.91223082
## x2 -0.49229573
## x3 0.09083985
## x4 -0.66651260
## x5 0.93584612
## x6 0.72461144

lda.pred = predict(lda.fit, dfTest)
names(lda.pred)

## [1] "class" "posterior" "x"

lda.class = lda.pred$class
table(lda.class, dfTest$color1)

##
## lda.class blue red
## blue 0 0
## red 933 4067

mean(lda.class == dfTest$color1) # 0.8134

## [1] 0.8134

# 3. Quadratic Discriminant Analysis
qda.fit = qda(color1 ~ ., data = dfTrain)
qda.fit

## Call:
## qda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
## blue red
## 0.1972 0.8028
##
## Group means:
## x1 x2 x3 x4 x5
## blue 0.030187117 0.00373233 -0.01429667 0.008993027 -0.01223305
## red 0.002206837 -0.01218605 -0.01279927 -0.010448713 0.01591678
## x6
## blue 0.002238118
## red 0.023971066

qda.pred = predict(qda.fit, dfTest)
qda.class = qda.pred$class
table(qda.class, dfTest$color1)

##
## qda.class blue red
## blue 131 0
## red 802 4067

mean(qda.class == dfTest$color1) # 0.8396

## [1] 0.8396

# 4. KNN
library(class)
knn.pred3 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 3)
table(knn.pred3, dfTest$color1)

##
## knn.pred3 blue red
## blue 596 267
## red 337 3800

mean(knn.pred3 == dfTest$color1) # 0.8792

## [1] 0.8792

knn.pred5 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 5)
table(knn.pred5, dfTest$color1)

##
## knn.pred5 blue red
## blue 597 206
## red 336 3861

mean(knn.pred5 == dfTest$color1) # 0.8916

## [1] 0.8916

knn.pred7 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 7)
table(knn.pred7, dfTest$color1)

##
## knn.pred7 blue red
## blue 593 176
## red 340 3891

mean(knn.pred7 == dfTest$color1) # 0.8968

## [1] 0.8968

knn.pred15 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 15)
table(knn.pred15, dfTest$color1)

##
## knn.pred15 blue red
## blue 574 117
## red 359 3950

mean(knn.pred15 == dfTest$color1) # 0.9048

## [1] 0.9048
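
Instead of trying a handful of k values by hand, k could be chosen by leave-one-out cross-validation on the training set with knn.cv() from the same class package. A minimal sketch (kGrid and cvAcc are hypothetical names):

# Leave-one-out CV accuracy on the training set over a grid of k values
kGrid = c(1, 3, 5, 7, 9, 11, 15, 21, 31)
cvAcc = sapply(kGrid, function(k)
  mean(knn.cv(train = dfTrain[, 1:6], cl = dfTrain$color1, k = k) == dfTrain$color1))
cbind(k = kGrid, accuracy = cvAcc)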

# 5. Tree
library(tree)

## Warning: package 'tree' was built under R version 3.1.3

tree.colors = tree(color1 ~ ., data = dfTrain)
summary(tree.colors)

##
## Classification tree:
## tree(formula = color1 ~ ., data = dfTrain)
## Variables actually used in tree construction:
## [1] "x1" "x4" "x3" "x2"
## Number of terminal nodes: 11
## Residual mean deviance: 0.5755 = 2871 / 4989
## Misclassification error rate: 0.0974 = 487 / 5000

tree.colors

## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 4965.00 red ( 0.19720 0.80280 )
## 2) x1 < -0.650728 873 254.50 red ( 0.03322 0.96678 ) *
## 3) x1 > -0.650728 4127 4470.00 red ( 0.23189 0.76811 )
## 6) x1 < 0.641545 3206 3829.00 red ( 0.28447 0.71553 )
## 12) x4 < -0.642267 588 254.20 red ( 0.05612 0.94388 ) *
## 13) x4 > -0.642267 2618 3341.00 red ( 0.33575 0.66425 )
## 26) x4 < 0.546652 1920 2618.00 red ( 0.42448 0.57552 )
## 52) x3 < 0.678641 1612 2234.00 red ( 0.48883 0.51117 )
## 104) x3 < -0.549429 421 358.90 red ( 0.15202 0.84798 ) *
## 105) x3 > -0.549429 1191 1595.00 blue ( 0.60789 0.39211 )
## 210) x2 < 0.774734 1063 1343.00 blue ( 0.67357 0.32643 )
## 420) x2 < -0.675639 186 167.60 red ( 0.16667 0.83333 ) *
## 421) x2 > -0.675639 877 921.80 blue ( 0.78107 0.21893 )
## 842) x2 < 0.551288 757 691.20 blue ( 0.82959 0.17041 )
## 1684) x2 < -0.435063 152 202.10 blue ( 0.61842 0.38158 ) *
## 1685) x2 > -0.435063 605 437.60 blue ( 0.88264 0.11736 ) *
## 843) x2 > 0.551288 120 166.10 red ( 0.47500 0.52500 ) *
## 211) x2 > 0.774734 128 59.85 red ( 0.06250 0.93750 ) *
## 53) x3 > 0.678641 308 183.00 red ( 0.08766 0.91234 ) *
## 27) x4 > 0.546652 698 427.80 red ( 0.09169 0.90831 ) *
## 7) x1 > 0.641545 921 359.50 red ( 0.04886 0.95114 ) *

tree.pred = predict(tree.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)

##
## tree.pred blue red
## blue 574 145
## red 359 3922

mean(tree.pred == dfTest$color1) # 0.8992

## [1] 0.8992

# Try pruning via cross-validation
cv.colors = cv.tree(tree.colors, FUN = prune.misclass)
names(cv.colors)

## [1] "size" "dev" "k" "method"

cv.colors

## $size
## [1] 11 10 9 1
##
## $dev
## [1] 548 548 550 986
##
## $k
## [1] -Inf 0.000 6.000 61.625
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"

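A quick plot of these cross-validation results (not in the original) makes the choice of tree size easier to see:

# Cross-validated misclassification count against tree size
plot(cv.colors$size, cv.colors$dev, type = "b",
     xlab = "Tree size (terminal nodes)", ylab = "CV misclassifications")
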
# Now prune the tree. The CV results favor the full tree (sizes 10-11 have the
# lowest error); best = 15 exceeds the tree size, so the unpruned tree is returned.
prune.colors = prune.misclass(tree.colors, best = 15)

## Warning in prune.tree(tree = tree.colors, best = 15, method = "misclass"):
## best is bigger than tree size

plot(prune.colors)
text(prune.colors, pretty = 0)

# Performance on test set
tree.pred = predict(prune.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)

##
## tree.pred blue red
## blue 574 145
## red 359 3922

mean(tree.pred == dfTest$color1) # 0.8992

## [1] 0.8992

# 6. Bagging (a random forest using all p = 6 predictors at each split)
library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

bag.colors = randomForest(color1 ~ ., data = dfTrain,
                          mtry = 6, importance = TRUE)
yhat.bag = predict(bag.colors, newdata = dfTest)
table(yhat.bag, dfTest$color1)

##
## yhat.bag blue red
## blue 676 119
## red 257 3948

mean(yhat.bag == dfTest$color1) # 0.9248

## [1] 0.9248

# 7. Random Forest
# Use 3 of 6 predictors at each split
bag.colorsF3 = randomForest(color1 ~ ., data = dfTrain,
                            mtry = 3, importance = TRUE)
yhat.bagF3 = predict(bag.colorsF3, newdata = dfTest)
table(yhat.bagF3, dfTest$color1)

##
## yhat.bagF3 blue red
## blue 675 120
## red 258 3947

mean(yhat.bagF3 == dfTest$color1) # 0.9244

## [1] 0.9244

sum(yhat.bagF3 == dfTest$color1)

## [1] 4622

# Use 2 of 6 predictors at each split
bag.colorsF2 = randomForest(color1 ~ ., data = dfTrain,
                            mtry = 2, importance = TRUE)
yhat.bagF2 = predict(bag.colorsF2, newdata = dfTest)
table(yhat.bagF2, dfTest$color1)

##
## yhat.bagF2 blue red
## blue 675 119
## red 258 3948

mean(yhat.bagF2 == dfTest$color1) # 0.9246

## [1] 0.9246

table(yhat.bag == yhat.bagF2)

##
## FALSE TRUE
## 1 4999

mean(yhat.bag == yhat.bagF2) # 0.9998

## [1] 0.9998

importance(bag.colors)

## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 154.2442790 123.3634200 176.245850 336.74727
## x2 142.5286583 107.2910048 160.290965 376.21805
## x3 142.0714991 110.0119122 163.214468 354.68932
## x4 151.4785433 117.4098121 180.011101 344.86527
## x5 0.9033374 3.5341044 3.557608 89.98382
## x6 -2.3339885 0.6331717 -0.676042 79.20312

importance(bag.colorsF2)

## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 119.986645 98.646841 138.8348004 350.4371
## x2 108.844194 84.839652 121.1783311 331.8474
## x3 115.647941 90.477256 127.1313051 340.0016
## x4 122.994365 95.045096 134.3153792 350.9749
## x5 3.263936 1.152052 2.7831518 107.2180
## x6 -1.215705 1.427644 0.4685702 101.0973

importance(bag.colorsF3)

## blue red MeanDecreaseAccuracy MeanDecreaseGini
## x1 141.119517 106.3559363 164.9998082 348.7953
## x2 130.473486 100.7578932 143.6887800 351.8899
## x3 127.689675 105.8876836 152.4531690 348.3248
## x4 140.001527 109.0224862 163.5321583 352.3392
## x5 1.601669 -1.4202147 -0.2838376 94.0987
## x6 -2.164920 -0.4052774 -1.6139238 86.1505

varImpPlot(bag.colorsF3)
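# Note: in all three fits, x5 and x6 have near-zero importance, consistent
# with the labeling rule, which uses only the first four coordinates.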

# 8. Boosting
library(caret) # used to help set up cross-validated tuning

## Loading required package: lattice
## Loading required package: ggplot2

library(gbm)

## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1

system.time(boost.colors <- train(color1 ~ ., method = "gbm", data = dfTrain, verbose = FALSE))

## Loading required package: plyr

## user system elapsed
## 55.51 0.08 56.52

ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)
grid <- expand.grid(interaction.depth = seq(3, 7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1))
system.time(gbmTune <- train(color1 ~ ., data = dfTrain,
                             method = "gbm",
                             metric = "ROC",
                             tuneGrid = grid,
                             verbose = FALSE,
                             trControl = ctrl))

## user system elapsed
## 1923.45 0.51 1935.69

library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")

gbmPred = predict(gbmTune, dfTest)
gbmProbs = predict(gbmTune, dfTest, type = "prob")
confusionMatrix(gbmPred, dfTest$color1)

## Confusion Matrix and Statistics
##
## Reference
## Prediction blue red
## blue 689 92
## red 244 3975
##
## Accuracy : 0.9328
## 95% CI : (0.9255, 0.9396)
## No Information Rate : 0.8134
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7638
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7385
## Specificity : 0.9774
## Pos Pred Value : 0.8822
## Neg Pred Value : 0.9422
## Prevalence : 0.1866
## Detection Rate : 0.1378
## Detection Prevalence : 0.1562
## Balanced Accuracy : 0.8579
##
## 'Positive' Class : blue
##

mean(gbmPred == dfTest$color1)

## [1] 0.9328

sum(gbmPred == dfTest$color1)

## [1] 4664
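
gbmProbs is computed above but never used. One natural use is a test-set ROC curve and AUC for the tuned model; a minimal sketch, assuming the pROC package is available (it is not loaded anywhere in this document):

# ROC curve and AUC on the test set, using the predicted probability of "blue"
library(pROC)
gbmROC = roc(response = dfTest$color1, predictor = gbmProbs$blue)
plot(gbmROC)
auc(gbmROC)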

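The outline above lists Support Vector Machines as step 10, but no SVM code appears in this document. A minimal sketch of what that step might look like, using the e1071 package with a radial kernel (an assumption, not part of the original analysis):

# SVM with a radial kernel; cost (and gamma) are left at defaults here and
# would normally be tuned, e.g. with e1071::tune()
library(e1071)
svm.fit = svm(color1 ~ ., data = dfTrain, kernel = "radial", cost = 1)
svm.pred = predict(svm.fit, newdata = dfTest)
table(svm.pred, dfTest$color1)
mean(svm.pred == dfTest$color1)
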
Results: Percent Correct

  1. Actual percent red (full data set): 80.81; test set: 81.34
  2. Logistic Regression: 81.34
  3. LDA: 81.34
  4. QDA: 83.96
  5. KNN (K = 15): 90.48 (best KNN result)
  6. Tree: 89.92
  7. Bagging: 92.48
  8. Random Forest: 92.46 (best of the two random-forest runs)
  9. Boosting: 93.28