# pdf("F:\\sta4210\\APPC4.pdf") admin <- read.table("http://www.stat.ufl.edu/~winner/sta4210/data/APPC4.txt", header=F, col.names=c("ID","GPA","HSCR","ACT","Year")) attach(admin) install.packages("rpart") library(rpart) set.seed(12345) est.set <- sample(1:705, 352) val.set <- (1:705)[-est.set] admin1 <- admin[est.set,] (admintree1 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0001)) #par(mfrow=c(1,2)) plot(admintree1,margin=.10) text(admintree1) plot(admintree1,compress=T,uniform=T,branch=0.4,margin=.10) text(admintree1) printcp(admintree1) adminols1 <- lm(GPA ~ HSCR+ACT+I(HSCR^2)+I(ACT^2)+I(HSCR*ACT),admin1) summary(adminols1) #### See below for how 7 region model was selected (admintree7 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0091)) #par(mfrow=c(1,2)) plot(admintree7,margin=.10) text(admintree7) plot(admintree7,compress=T,uniform=T,branch=0.4,margin=.10) text(admintree7) (mse.tree.est <- sum((GPA[est.set]-predict(admintree7,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree7,admin[val.set,]))^2)/ length(val.set)) (mse.ols.est <- sum((GPA[est.set]-predict(adminols1,admin[est.set,]))^2)/ length(est.set)) (mspr.ols.val <- sum((GPA[val.set]-predict(adminols1,admin[val.set,]))^2)/ length(val.set)) #### This uses the 2-9 Region models to determine model with #### minimum MSPR for the validtion sample, which is 7, not the 5 #### in textbook which used different estimation and validation samples admintree2 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0456) admintree3 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0294) admintree4 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0172) admintree5 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0126) admintree6 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0115) admintree7 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0091) admintree8 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0087) admintree9 <- rpart(GPA ~ HSCR + ACT,admin1,method="anova",cp=.0068) (mse.tree.est <- sum((GPA[est.set]-predict(admintree2,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree2,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree3,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree3,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree4,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree4,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree5,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree5,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree6,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree6,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree7,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree7,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree8,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree8,admin[val.set,]))^2)/ length(val.set)) (mse.tree.est <- sum((GPA[est.set]-predict(admintree9,admin[est.set,]))^2)/ length(est.set)) (mspr.tree.val <- sum((GPA[val.set]-predict(admintree9,admin[val.set,]))^2)/ length(val.set)) # dev.off()