R16: Classification Trees
Classification trees (and regression trees, which are similar, but use numeric dependent variables) may be obtained with the rpart library.
library(rpart)
1. South African Heart Disease Data
A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of CHD. Many of the CHD positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their CHD event. In some cases the measurements were
made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.
sbp systolic blood pressure
tobacco cumulative tobacco (kg)
ldl low densiity lipoprotein cholesterol
adiposity a well-recognized risk factor for type 2 diabetes
famhist family history of heart disease (Present, Absent)
typea type-A behavior
obesity
alcohol current alcohol consumption
age age at onset
chd response, coronary heart disease
heart<-read.table("http://www.uidaho.edu/~stevel/519/Data/SAheart.txt") head(heart)
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1
2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1
3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0
4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1
5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1
6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 0
pairs(heart, col=heart$chd+1, pch=heart$chd+1)
heart.cart<-rpart(chd~., data=heart, method="class")
heart.cart
n= 462
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 462 160 0 (0.65367965 0.34632035)
2) age< 50.5 290 64 0 (0.77931034 0.22068966)
4) age< 30.5 108 8 0 (0.92592593 0.07407407) *
5) age=30.5 182 56 0 (0.69230769 0.30769231)
10) typea< 68.5 170 46 0 (0.72941176 0.27058824) *
11) typea=68.5 12 2 1 (0.16666667 0.83333333) *
3) age=50.5 172 76 1 (0.44186047 0.55813953)
6) famhist=Absent 82 33 0 (0.59756098 0.40243902)
12) tobacco< 7.605 58 16 0 (0.72413793 0.27586207) *
13) tobacco=7.605 24 7 1 (0.29166667 0.70833333) *
7) famhist=Present 90 27 1 (0.30000000 0.70000000)
14) ldl< 4.99 39 18 1 (0.46153846 0.53846154)
28) adiposity=27.985 20 7 0 (0.65000000 0.35000000)
56) tobacco< 4.15 10 1 0 (0.90000000 0.10000000) *
57) tobacco=4.15 10 4 1 (0.40000000 0.60000000) *
29) adiposity< 27.985 19 5 1 (0.26315789 0.73684211) *
15) ldl=4.99 51 9 1 (0.17647059 0.82352941) *
plot(heart.cart,margin=.2)
text(heart.cart, use.n=FALSE, pretty=0)
heart.cart.pred <- predict(heart.cart, heart[,-10], type="class")
table(Actual=heart$chd, Classified=heart.cart.pred)
Classified
Actual 0 1
0 275 27
1 71 89
2. Cushings Syndrome Data
library(MASS)
Attaching package: 'MASS'
data(Cushings)
nrow(Cushings)
[1] 27
unk <- (1:27)[Cushings[,3]=="u"]
cush.ltp <- log(Cushings[-unk,1:2])
cush.class <- factor(Cushings[-unk,3])
cush.ltpc <- data.frame(cush.ltp,class=cush.class)
cush.cart <- rpart(class~., data=cush.ltpc, method="class",
minsplit=5)
plot(cush.cart,margin=.2)
text(cush.cart, use.n=FALSE, pretty=0)
plot(cush.ltp,col=unclass(cush.class),pch=as.character(cush.class))
segments(1.5757,-4, 1.5757,3) # Plot first split
segments(1.5757, .6931, 4.5, .6931) # Second split
summary(cush.cart)
Call:
rpart(formula = class ~ ., data = cush.ltpc, method = "class",
minsplit = 5)
n= 21
CP nsplit rel error xerror xstd
1 0.3636364 0 1.0000000 1.0000000 0.2080626
2 0.0100000 2 0.2727273 0.8181818 0.2061624
Node number 1: 21 observations, complexity param=0.3636364
predicted class=b expected loss=0.5238095
class counts: 6 10 5
probabilities: 0.286 0.476 0.238
left son=2 (8 obs) right son=3 (13 obs)
Primary splits:
Tetrahydrocortisone < 1.575727 to the left, improve=4.179487, (0 missing)
Pregnanetriol < 0.6931472 to the left, improve=3.761905, (0 missing)
Surrogate splits:
Pregnanetriol < -1.262864 to the left, agree=0.762, adj=0.375, (0 split)
Node number 2: 8 observations
predicted class=a expected loss=0.25
class counts: 6 2 0
probabilities: 0.750 0.250 0.000
Node number 3: 13 observations, complexity param=0.3636364
predicted class=b expected loss=0.3846154
class counts: 0 8 5
probabilities: 0.000 0.615 0.385
left son=6 (7 obs) right son=7 (6 obs)
Primary splits:
Pregnanetriol < 0.6931472 to the left, improve=4.487179, (0 missing)
Tetrahydrocortisone < 2.213739 to the left, improve=3.296703, (0 missing)
Surrogate splits:
Tetrahydrocortisone < 2.213739 to the left, agree=0.923, adj=0.833, (0 split)
Node number 6: 7 observations
predicted class=b expected loss=0
class counts: 0 7 0
probabilities: 0.000 1.000 0.000
Node number 7: 6 observations
predicted class=c expected loss=0.1666667
class counts: 0 1 5
probabilities: 0.000 0.167 0.833
3. Crabs Data
data(crabs)
crabs.sp <- crabs[,-c(2,3)]
crabs.cart <- rpart(sp ~ ., crabs.sp, method="class")
plot(crabs.cart,margin=.2)
text(crabs.cart, use.n=FALSE, pretty=0)
crabs.cart.pred <- predict(crabs.cart, crabs, type="class")
table(Actual=sp, Classified=crabs.cart.pred)
Classified
Actual B O
B 96 4
O 10 90
# Classification Trees
# Example - heart disease data
library(rpart)
heart <- read.table("SAheart.txt")
head(heart)
pairs(heart, col=heart$chd+1, pch=heart$chd+1)
heart.cart<-rpart(chd~., data=heart, method="class")
heart.cart
plot(heart.cart)
text(heart.cart, use.n=FALSE, pretty=0)
# Cushings syndrome data
library(MASS)
data(Cushings)
nrow(Cushings)
unk <- (1:27)[Cushings[,3]=="u"]
cush.ltp <- log(Cushings[-unk,1:2])
cush.class <- factor(Cushings[-unk,3])
cush.ltpc <- data.frame(cush.ltp,class=cush.class)
cush.cart <- rpart(class~., data=cush.ltpc, method="class", minsplit=5)
plot(cush.cart)
text(cush.cart, use.n=FALSE, pretty=0)
plot(cush.ltp,col=unclass(cush.class),pch=as.character(cush.class))
segments(1.5757,-4, 1.5757,3) # Plot first split
segments(1.5757, .6931, 4.5, .6931) # Second split
# Crabs data
data(crabs)
head(crabs)
crabs.sp <- crabs[,-c(2,3)]
crabs.cart <- rpart(sp ~ ., crabs.sp, method="class")
plot(crabs.cart)
text(crabs.cart, use.n=FALSE, pretty=0)
crabs.cart.pred <- predict(crabs.cart, crabs, type="class")
table(Actual=sp, Classified=crabs.cart.pred)