Section 9.3.5: CART for Baseball Salary Data: S-Plus

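In the original document, input commands are shown in red and output in black. In a plain-text copy the colors are lost: lines containing S-Plus commands are input, and the lines that follow them are output.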

# Put the columns of `baseball` on the search path.
# NOTE(review): the commands visible in this section always index `baseball`
# explicitly, so this attach may only matter to code outside this section.
attach(baseball)

# Report the dimensions (rows, columns) of the data set.
dim(baseball)

[1] 337 18

# Load the rpart (recursive partitioning) library.
library(rpart)

# Fix the random seed before fitting -- presumably so that the cross-validated
# columns (xerror, xstd) reported by printcp() are reproducible.
set.seed(123)

# Grow a deliberately large tree: column 1 of `baseball` is the response and
# columns 2-12 and 14-17 are the candidate predictors (13 and 18 are left out).
# The very small cp (0.00006) lets the tree keep splitting so that it can be
# pruned back afterwards.
# NOTE(review): the fit is reported as a regression tree, and the rpart
# documentation states that anova splitting has no parameters, so
# parms=list(split="gini") looks like a no-op here -- confirm.
baseball.rp <- rpart(baseball[,1] ~., data = baseball[,c(2:12,14:17)], cp=0.00006, parms=list(split="gini"))

# Print the fitted tree node by node (split, n, deviance, fitted value).
baseball.rp

n= 337

node), split, n, deviance, yval

* denotes terminal node

1) root 337 516644700.00 1248.5280

2) Runs< 46.5 188 84647780.00 606.2979

4) FAE< 0.5 135 18814190.00 370.9556

8) AE< 0.5 108 2747404.00 232.8981

16) Doubles< 8.5 59 465903.50 174.6441

32) RBI< 8.5 27 15680.74 127.5185 *

33) RBI>=8.5 32 339667.70 214.4063

66) SO>=23.5 18 123006.50 186.1667 *

67) SO< 23.5 14 183850.90 250.7143 *

17) Doubles>=8.5 49 1840202.00 303.0408

34) OBP< 0.3145 28 507538.90 246.4286

68) HR>=1.5 20 89509.75 193.2500 *

69) HR< 1.5 8 220071.90 379.3750 *

35) OBP>=0.3145 21 1123273.00 378.5238

70) Hits< 64 12 132508.70 252.3333 *

71) Hits>=64 9 544891.60 546.7778 *

9) AE>=0.5 27 5774466.00 923.1852

18) Errors< 1.5 12 987303.70 676.8333 *

19) Errors>=1.5 15 3476275.00 1120.2670 *

5) FAE>=0.5 53 39311010.00 1205.7550

10) Runs< 28.5 23 3061716.00 680.9130

20) OBP< 0.307 13 902380.80 577.6923 *

21) OBP>=0.307 10 1840765.00 815.1000 *

11) Runs>=28.5 30 25056490.00 1608.1330

22) Doubles>=11.5 19 9865468.00 1390.6840 *

23) Doubles< 11.5 11 12740840.00 1983.7270 *

3) Runs>=46.5 149 256615700.00 2058.8590

6) FAE< 0.5 68 89617800.00 1295.2500

12) AE< 0.5 30 680473.00 370.3667

24) SO>=72.5 21 323194.60 323.1429

48) Doubles< 23.5 8 17602.88 237.1250 *

49) Doubles>=23.5 13 209972.90 376.0769 *

25) SO< 72.5 9 201172.20 480.5556 *

13) AE>=0.5 38 43015360.00 2025.4210

26) RBI< 81.5 27 10515970.00 1612.7040

52) Hits< 122 7 2107224.00 1122.4290 *

53) Hits>=122 20 6137256.00 1784.3000

106) HR>=10.5 7 2073571.00 1510.7140 *

107) HR< 10.5 13 3257617.00 1931.6150 *

27) RBI>=81.5 11 16611720.00 3038.4550 *

7) FAE>=0.5 81 94060150.00 2699.9140

14) RBI< 64.5 33 26552130.00 2034.9090

28) Errors< 27.5 26 16521170.00 1781.7690

56) FA>=0.5 7 2097069.00 1101.2860 *

57) FA< 0.5 19 9988497.00 2032.4740 *

29) Errors>=27.5 7 2176603.00 2975.1430 *

15) RBI>=64.5 48 42881280.00 3157.1040

30) RBI< 94.5 34 15300490.00 2852.3530

60) SO>=85.5 13 2419177.00 2523.6150 *

61) SO< 85.5 21 10606730.00 3055.8570

122) Errors< 5 11 4819435.00 2776.4550 *

123) Errors>=5 10 3983978.00 3363.2000 *

31) RBI>=94.5 14 16754420.00 3897.2140 *

# Plot the complexity-parameter (cp) table of the full tree.
plotcp(baseball.rp)

# Print the cp table (CP, nsplit, rel error, xerror, xstd) of the full tree.
printcp(baseball.rp)

Regression tree:

rpart(formula = baseball[, 1] ~ ., data = baseball[, c(2:12, 14:17)], parms = list(split =

"gini"), cp = 6e-005)

Variables actually used in tree construction:

[1] AE Doubles Errors FA FAE HR Hits OBP RBI Runs SO

Root node error: 5.1664e8/337 = 1.5331e6

n= 337

CP nsplit rel error xerror xstd xerror+xstd

1 0.339461963 0 1.00000 1.00551 0.089106 1.094616

2 0.141175826 1 0.66054 0.74414 0.066536 0.810676

3 0.088885006 2 0.51936 0.59435 0.063632 0.657982

4 0.051336213 3 0.43048 0.49638 0.057760 0.55414

5 0.047666680 4 0.37914 0.46016 0.054754 0.514914

6 0.030751631 5 0.33147 0.43612 0.051051 0.487171

7 0.021664422 6 0.30072 0.44210 0.051176 0.493276

8 0.020955164 7 0.27906 0.41008 0.050022 0.460102

9 0.019921462 8 0.25810 0.38939 0.047868 0.437258

10 0.015202617 9 0.23818 0.36378 0.047041 0.41082

11 0.008585410 10 0.22298 0.35825 0.048875 0.407125***

12 0.004742480 11 0.21439 0.36830 0.049617 0.417917

13 0.004402604 12 0.20965 0.39253 0.051714 0.444244

14 0.004396626 13 0.20525 0.39184 0.051711

15 0.003490446 14 0.20085 0.39245 0.052372

16 0.002537309 15 0.19736 0.40175 0.054013

17 0.001560197 16 0.19482 0.40682 0.053987

18 0.000854162 17 0.19326 0.40727 0.053980

19 0.000634152 18 0.19241 0.40867 0.053956

20 0.000616614 20 0.19114 0.40804 0.053965

21 0.000383159 21 0.19052 0.40783 0.053969

22 0.000302154 22 0.19014 0.40848 0.053959

23 0.000213987 23 0.18984 0.40842 0.053960

24 0.000185076 24 0.18963 0.40828 0.053963

25 0.000063507 25 0.18944 0.40828 0.053963

26 0.000060000 26 0.18938 0.40836 0.053961

# Redraw the complexity-parameter plot of the full tree.
plotcp(baseball.rp)

# Draw the full tree with uniform vertical spacing between levels.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
plot(baseball.rp, uniform=TRUE, branch=0.1, margin=0.01)

# Label every node (all=TRUE) and include the observation counts (use.n=TRUE).
text(baseball.rp, all=TRUE, use.n=TRUE, pretty=0, fancy=TRUE, fwidth=7.5, fheight=0.8)

# Prune the full tree back with cp = 0.01. This threshold lies between rows 10
# and 11 of the cp table printed for the full tree, so the pruned tree keeps
# 10 splits.
baseball.rp2 <- prune(baseball.rp, cp=0.01)

# Print the pruned tree node by node.
baseball.rp2

n= 337

node), split, n, deviance, yval

* denotes terminal node

1) root 337 516644700 1248.5280

2) Runs< 46.5 188 84647780 606.2979

4) FAE< 0.5 135 18814190 370.9556

8) AE< 0.5 108 2747404 232.8981 *

9) AE>=0.5 27 5774466 923.1852 *

5) FAE>=0.5 53 39311010 1205.7550

10) Runs< 28.5 23 3061716 680.9130 *

11) Runs>=28.5 30 25056490 1608.1330 *

3) Runs>=46.5 149 256615700 2058.8590

6) FAE< 0.5 68 89617800 1295.2500

12) AE< 0.5 30 680473 370.3667 *

13) AE>=0.5 38 43015360 2025.4210

26) RBI< 81.5 27 10515970 1612.7040 *

27) RBI>=81.5 11 16611720 3038.4550 *

7) FAE>=0.5 81 94060150 2699.9140

14) RBI< 64.5 33 26552130 2034.9090

28) Errors< 27.5 26 16521170 1781.7690 *

29) Errors>=27.5 7 2176603 2975.1430 *

15) RBI>=64.5 48 42881280 3157.1040

30) RBI< 94.5 34 15300490 2852.3530 *

31) RBI>=94.5 14 16754420 3897.2140 *

# Print the cp table of the pruned tree.
printcp(baseball.rp2)

Regression tree:

rpart(formula = baseball[, 1] ~ ., data = baseball[, c(2:12, 14:17)], parms = list(split =

"gini"), cp = 6e-005)

Variables actually used in tree construction:

[1] AE Errors FAE RBI Runs

Root node error: 5.1664e8/337 = 1.5331e6

n= 337

CP nsplit rel error xerror xstd

1 0.339462 0 1.00000 1.00551 0.089106

2 0.141176 1 0.66054 0.74414 0.066536

3 0.088885 2 0.51936 0.59435 0.063632

4 0.051336 3 0.43048 0.49638 0.057760

5 0.047667 4 0.37914 0.46016 0.054754

6 0.030752 5 0.33147 0.43612 0.051051

7 0.021664 6 0.30072 0.44210 0.051176

8 0.020955 7 0.27906 0.41008 0.050022

9 0.019921 8 0.25810 0.38939 0.047868

10 0.015203 9 0.23818 0.36378 0.047041

11 0.010000 10 0.22298 0.35825 0.048875