Section 9.3.5: CART for Baseball Salary Data: S-Plus
In the original formatted document, input is shown in red and output in black; this color coding is not visible in plain text.
# Put the columns of `baseball` on the search path, then report its dimensions.
# NOTE(review): every later call visible in this transcript indexes
# `baseball[...]` explicitly rather than using bare column names, so attach()
# may be unnecessary -- confirm against the rest of the session before removing.
attach(baseball)
dim(baseball)
[1] 337 18
# Load rpart and fix the random seed before fitting
# (presumably so the cross-validated xerror/xstd columns are reproducible --
# confirm).
library(rpart)
set.seed(123)

# Fit a tree with column 1 of `baseball` as the response and columns 2-12 and
# 14-17 as candidate predictors (columns 13 and 18 are left out).
# cp = 0.00006 is a very small complexity threshold, so the tree is grown large
# here; it is cut back later in the session with prune().
# NOTE(review): the printed result is a regression tree (deviance / yval), and
# rpart's documentation states that anova splitting has no parameters, so
# split = "gini" presumably has no effect on this fit -- confirm.
baseball.rp <- rpart(
  formula = baseball[, 1] ~ .,
  data = baseball[, c(2:12, 14:17)],
  parms = list(split = "gini"),
  cp = 0.00006
)
print(baseball.rp)
n= 337
node), split, n, deviance, yval
* denotes terminal node
1) root 337 516644700.00 1248.5280
2) Runs< 46.5 188 84647780.00 606.2979
4) FAE< 0.5 135 18814190.00 370.9556
8) AE< 0.5 108 2747404.00 232.8981
16) Doubles< 8.5 59 465903.50 174.6441
32) RBI< 8.5 27 15680.74 127.5185 *
33) RBI>=8.5 32 339667.70 214.4063
66) SO>=23.5 18 123006.50 186.1667 *
67) SO< 23.5 14 183850.90 250.7143 *
17) Doubles>=8.5 49 1840202.00 303.0408
34) OBP< 0.3145 28 507538.90 246.4286
68) HR>=1.5 20 89509.75 193.2500 *
69) HR< 1.5 8 220071.90 379.3750 *
35) OBP>=0.3145 21 1123273.00 378.5238
70) Hits< 64 12 132508.70 252.3333 *
71) Hits>=64 9 544891.60 546.7778 *
9) AE>=0.5 27 5774466.00 923.1852
18) Errors< 1.5 12 987303.70 676.8333 *
19) Errors>=1.5 15 3476275.00 1120.2670 *
5) FAE>=0.5 53 39311010.00 1205.7550
10) Runs< 28.5 23 3061716.00 680.9130
20) OBP< 0.307 13 902380.80 577.6923 *
21) OBP>=0.307 10 1840765.00 815.1000 *
11) Runs>=28.5 30 25056490.00 1608.1330
22) Doubles>=11.5 19 9865468.00 1390.6840 *
23) Doubles< 11.5 11 12740840.00 1983.7270 *
3) Runs>=46.5 149 256615700.00 2058.8590
6) FAE< 0.5 68 89617800.00 1295.2500
12) AE< 0.5 30 680473.00 370.3667
24) SO>=72.5 21 323194.60 323.1429
48) Doubles< 23.5 8 17602.88 237.1250 *
49) Doubles>=23.5 13 209972.90 376.0769 *
25) SO< 72.5 9 201172.20 480.5556 *
13) AE>=0.5 38 43015360.00 2025.4210
26) RBI< 81.5 27 10515970.00 1612.7040
52) Hits< 122 7 2107224.00 1122.4290 *
53) Hits>=122 20 6137256.00 1784.3000
106) HR>=10.5 7 2073571.00 1510.7140 *
107) HR< 10.5 13 3257617.00 1931.6150 *
27) RBI>=81.5 11 16611720.00 3038.4550 *
7) FAE>=0.5 81 94060150.00 2699.9140
14) RBI< 64.5 33 26552130.00 2034.9090
28) Errors< 27.5 26 16521170.00 1781.7690
56) FA>=0.5 7 2097069.00 1101.2860 *
57) FA< 0.5 19 9988497.00 2032.4740 *
29) Errors>=27.5 7 2176603.00 2975.1430 *
15) RBI>=64.5 48 42881280.00 3157.1040
30) RBI< 94.5 34 15300490.00 2852.3530
60) SO>=85.5 13 2419177.00 2523.6150 *
61) SO< 85.5 21 10606730.00 3055.8570
122) Errors< 5 11 4819435.00 2776.4550 *
123) Errors>=5 10 3983978.00 3363.2000 *
31) RBI>=94.5 14 16754420.00 3897.2140 *
# Plot the complexity-parameter (CP) results for the full tree, then print its
# CP table.
plotcp(baseball.rp)
printcp(baseball.rp)
Regression tree:
rpart(formula = baseball[, 1] ~ ., data = baseball[, c(2:12, 14:17)], parms = list(split =
"gini"), cp = 6e-005)
Variables actually used in tree construction:
[1] AE Doubles Errors FA FAE HR Hits OBP RBI Runs SO
Root node error: 5.1664e8/337 = 1.5331e6
n= 337
CP nsplit rel error xerror xstd xerror+xstd
1 0.339461963 0 1.00000 1.00551 0.089106 1.094616
2 0.141175826 1 0.66054 0.74414 0.066536 0.810676
3 0.088885006 2 0.51936 0.59435 0.063632 0.657982
4 0.051336213 3 0.43048 0.49638 0.057760 0.55414
5 0.047666680 4 0.37914 0.46016 0.054754 0.514914
6 0.030751631 5 0.33147 0.43612 0.051051 0.487171
7 0.021664422 6 0.30072 0.44210 0.051176 0.493276
8 0.020955164 7 0.27906 0.41008 0.050022 0.460102
9 0.019921462 8 0.25810 0.38939 0.047868 0.437258
10 0.015202617 9 0.23818 0.36378 0.047041 0.41082
11 0.008585410 10 0.22298 0.35825 0.048875 0.407125 ***
12 0.004742480 11 0.21439 0.36830 0.049617 0.417917
13 0.004402604 12 0.20965 0.39253 0.051714 0.444244
14 0.004396626 13 0.20525 0.39184 0.051711
15 0.003490446 14 0.20085 0.39245 0.052372
16 0.002537309 15 0.19736 0.40175 0.054013
17 0.001560197 16 0.19482 0.40682 0.053987
18 0.000854162 17 0.19326 0.40727 0.053980
19 0.000634152 18 0.19241 0.40867 0.053956
20 0.000616614 20 0.19114 0.40804 0.053965
21 0.000383159 21 0.19052 0.40783 0.053969
22 0.000302154 22 0.19014 0.40848 0.053959
23 0.000213987 23 0.18984 0.40842 0.053960
24 0.000185076 24 0.18963 0.40828 0.053963
25 0.000063507 25 0.18944 0.40828 0.053963
26 0.000060000 26 0.18938 0.40836 0.053961
# Redraw the CP plot for the full tree.
plotcp(baseball.rp)

# Draw the full tree and label its nodes. TRUE is spelled out instead of T
# because T is an ordinary variable that can be reassigned, whereas TRUE is a
# reserved constant.
plot(baseball.rp, uniform = TRUE, branch = 0.1, margin = 0.01)
text(
  baseball.rp,
  all = TRUE, use.n = TRUE, pretty = 0, fancy = TRUE,
  fwidth = 7.5, fheight = 0.8
)

# Prune the full tree at cp = 0.01 and print the resulting smaller tree.
baseball.rp2 <- prune(baseball.rp, cp = 0.01)
baseball.rp2
n= 337
node), split, n, deviance, yval
* denotes terminal node
1) root 337 516644700 1248.5280
2) Runs< 46.5 188 84647780 606.2979
4) FAE< 0.5 135 18814190 370.9556
8) AE< 0.5 108 2747404 232.8981 *
9) AE>=0.5 27 5774466 923.1852 *
5) FAE>=0.5 53 39311010 1205.7550
10) Runs< 28.5 23 3061716 680.9130 *
11) Runs>=28.5 30 25056490 1608.1330 *
3) Runs>=46.5 149 256615700 2058.8590
6) FAE< 0.5 68 89617800 1295.2500
12) AE< 0.5 30 680473 370.3667 *
13) AE>=0.5 38 43015360 2025.4210
26) RBI< 81.5 27 10515970 1612.7040 *
27) RBI>=81.5 11 16611720 3038.4550 *
7) FAE>=0.5 81 94060150 2699.9140
14) RBI< 64.5 33 26552130 2034.9090
28) Errors< 27.5 26 16521170 1781.7690 *
29) Errors>=27.5 7 2176603 2975.1430 *
15) RBI>=64.5 48 42881280 3157.1040
30) RBI< 94.5 34 15300490 2852.3530 *
31) RBI>=94.5 14 16754420 3897.2140 *
# Print the CP table of the pruned tree.
printcp(baseball.rp2)
Regression tree:
rpart(formula = baseball[, 1] ~ ., data = baseball[, c(2:12, 14:17)], parms = list(split =
"gini"), cp = 6e-005)
Variables actually used in tree construction:
[1] AE Errors FAE RBI Runs
Root node error: 5.1664e8/337 = 1.5331e6
n= 337
CP nsplit rel error xerror xstd
1 0.339462 0 1.00000 1.00551 0.089106
2 0.141176 1 0.66054 0.74414 0.066536
3 0.088885 2 0.51936 0.59435 0.063632
4 0.051336 3 0.43048 0.49638 0.057760
5 0.047667 4 0.37914 0.46016 0.054754
6 0.030752 5 0.33147 0.43612 0.051051
7 0.021664 6 0.30072 0.44210 0.051176
8 0.020955 7 0.27906 0.41008 0.050022
9 0.019921 8 0.25810 0.38939 0.047868
10 0.015203 9 0.23818 0.36378 0.047041
11 0.010000 10 0.22298 0.35825 0.048875