R16: Classification Trees

Classification trees (and regression trees, which are similar, but use numeric dependent variables) may be obtained with the rpart library.

library(rpart)

1. South African Heart Disease Data

A retrospective sample of males in a heart-disease high-risk region of the Western Cape, South Africa. There are roughly two controls per case of CHD. Many of the CHD positive men have undergone blood pressure reduction treatment and other programs to reduce their risk factors after their CHD event. In some cases the measurements were

made after these treatments. These data are taken from a larger dataset, described in Rousseauw et al, 1983, South African Medical Journal.

sbp systolic blood pressure

tobacco cumulative tobacco (kg)

ldl low densiity lipoprotein cholesterol

adiposity a well-recognized risk factor for type 2 diabetes

famhist family history of heart disease (Present, Absent)

typea type-A behavior

obesity

alcohol current alcohol consumption

age age at onset

chd response, coronary heart disease

heart<-read.table("http://www.uidaho.edu/~stevel/519/Data/SAheart.txt") head(heart)

sbp tobacco ldl adiposity famhist typea obesity alcohol age chd

1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1

2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1

3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0

4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1

5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1

6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 0

pairs(heart, col=heart$chd+1, pch=heart$chd+1)


heart.cart<-rpart(chd~., data=heart, method="class")

heart.cart

n= 462

node), split, n, loss, yval, (yprob)

* denotes terminal node

1) root 462 160 0 (0.65367965 0.34632035)

2) age< 50.5 290 64 0 (0.77931034 0.22068966)

4) age< 30.5 108 8 0 (0.92592593 0.07407407) *

5) age=30.5 182 56 0 (0.69230769 0.30769231)

10) typea< 68.5 170 46 0 (0.72941176 0.27058824) *

11) typea=68.5 12 2 1 (0.16666667 0.83333333) *

3) age=50.5 172 76 1 (0.44186047 0.55813953)

6) famhist=Absent 82 33 0 (0.59756098 0.40243902)

12) tobacco< 7.605 58 16 0 (0.72413793 0.27586207) *

13) tobacco=7.605 24 7 1 (0.29166667 0.70833333) *

7) famhist=Present 90 27 1 (0.30000000 0.70000000)

14) ldl< 4.99 39 18 1 (0.46153846 0.53846154)

28) adiposity=27.985 20 7 0 (0.65000000 0.35000000)

56) tobacco< 4.15 10 1 0 (0.90000000 0.10000000) *

57) tobacco=4.15 10 4 1 (0.40000000 0.60000000) *

29) adiposity< 27.985 19 5 1 (0.26315789 0.73684211) *

15) ldl=4.99 51 9 1 (0.17647059 0.82352941) *

plot(heart.cart,margin=.2)

text(heart.cart, use.n=FALSE, pretty=0)

heart.cart.pred <- predict(heart.cart, heart[,-10], type="class")

table(Actual=heart$chd, Classified=heart.cart.pred)

Classified

Actual 0 1

0 275 27

1 71 89


2. Cushings Syndrome Data

library(MASS)

Attaching package: 'MASS'

data(Cushings)

nrow(Cushings)

[1] 27

unk <- (1:27)[Cushings[,3]=="u"]

cush.ltp <- log(Cushings[-unk,1:2])

cush.class <- factor(Cushings[-unk,3])

cush.ltpc <- data.frame(cush.ltp,class=cush.class)

cush.cart <- rpart(class~., data=cush.ltpc, method="class",

minsplit=5)

plot(cush.cart,margin=.2)

text(cush.cart, use.n=FALSE, pretty=0)

plot(cush.ltp,col=unclass(cush.class),pch=as.character(cush.class))

segments(1.5757,-4, 1.5757,3) # Plot first split

segments(1.5757, .6931, 4.5, .6931) # Second split

summary(cush.cart)

Call:

rpart(formula = class ~ ., data = cush.ltpc, method = "class",

minsplit = 5)

n= 21

CP nsplit rel error xerror xstd

1 0.3636364 0 1.0000000 1.0000000 0.2080626

2 0.0100000 2 0.2727273 0.8181818 0.2061624

Node number 1: 21 observations, complexity param=0.3636364

predicted class=b expected loss=0.5238095

class counts: 6 10 5

probabilities: 0.286 0.476 0.238

left son=2 (8 obs) right son=3 (13 obs)

Primary splits:

Tetrahydrocortisone < 1.575727 to the left, improve=4.179487, (0 missing)

Pregnanetriol < 0.6931472 to the left, improve=3.761905, (0 missing)

Surrogate splits:

Pregnanetriol < -1.262864 to the left, agree=0.762, adj=0.375, (0 split)

Node number 2: 8 observations

predicted class=a expected loss=0.25

class counts: 6 2 0

probabilities: 0.750 0.250 0.000

Node number 3: 13 observations, complexity param=0.3636364

predicted class=b expected loss=0.3846154

class counts: 0 8 5

probabilities: 0.000 0.615 0.385

left son=6 (7 obs) right son=7 (6 obs)

Primary splits:

Pregnanetriol < 0.6931472 to the left, improve=4.487179, (0 missing)

Tetrahydrocortisone < 2.213739 to the left, improve=3.296703, (0 missing)

Surrogate splits:

Tetrahydrocortisone < 2.213739 to the left, agree=0.923, adj=0.833, (0 split)

Node number 6: 7 observations

predicted class=b expected loss=0

class counts: 0 7 0

probabilities: 0.000 1.000 0.000

Node number 7: 6 observations

predicted class=c expected loss=0.1666667

class counts: 0 1 5

probabilities: 0.000 0.167 0.833

3. Crabs Data

data(crabs)

crabs.sp <- crabs[,-c(2,3)]

crabs.cart <- rpart(sp ~ ., crabs.sp, method="class")

plot(crabs.cart,margin=.2)

text(crabs.cart, use.n=FALSE, pretty=0)

crabs.cart.pred <- predict(crabs.cart, crabs, type="class")

table(Actual=sp, Classified=crabs.cart.pred)

Classified

Actual B O

B 96 4

O 10 90

# Classification Trees

# Example - heart disease data

library(rpart)

heart <- read.table("SAheart.txt")

head(heart)

pairs(heart, col=heart$chd+1, pch=heart$chd+1)

heart.cart<-rpart(chd~., data=heart, method="class")

heart.cart

plot(heart.cart)

text(heart.cart, use.n=FALSE, pretty=0)

# Cushings syndrome data

library(MASS)

data(Cushings)

nrow(Cushings)

unk <- (1:27)[Cushings[,3]=="u"]

cush.ltp <- log(Cushings[-unk,1:2])

cush.class <- factor(Cushings[-unk,3])

cush.ltpc <- data.frame(cush.ltp,class=cush.class)

cush.cart <- rpart(class~., data=cush.ltpc, method="class", minsplit=5)

plot(cush.cart)

text(cush.cart, use.n=FALSE, pretty=0)

plot(cush.ltp,col=unclass(cush.class),pch=as.character(cush.class))

segments(1.5757,-4, 1.5757,3) # Plot first split

segments(1.5757, .6931, 4.5, .6931) # Second split

# Crabs data

data(crabs)

head(crabs)

crabs.sp <- crabs[,-c(2,3)]

crabs.cart <- rpart(sp ~ ., crabs.sp, method="class")

plot(crabs.cart)

text(crabs.cart, use.n=FALSE, pretty=0)

crabs.cart.pred <- predict(crabs.cart, crabs, type="class")

table(Actual=sp, Classified=crabs.cart.pred)