pdf("F:\\Rmisc\\graphs\\cruise_ship.pdf") # Route Graphics to a pdf file ####### set random number seed for reproducible sampling based methods set.seed(13579) cruise <- read.fwf("http://www.stat.ufl.edu/~winner/data/cruise_ship.dat", width=c(20,20,rep(8,7)), col.names=c("ship", "cline", "age", "tonnage", "passengers", "length", "cabins", "passdens", "crew")) attach(cruise) head(cruise) cor(cruise[,-c(1,2)]) install.packages("GGally") GGally::ggpairs(cruise[,-c(1,2)]) ####### Fit Full model fit0 <- lm(crew ~ age + tonnage + length + cabins + passdens) summary(fit0) anova(fit0) drop1(fit0,test="F") ######### Perform Backward Elimination, Forward Selection, and Stepwise Regression ######### Based on Model AIC (not individual regression coefficients) ######### fit1 and fit2 represent "extreme" models library(MASS) fit1 <- lm(crew ~ age + tonnage + length + cabins + passdens) fit2 <- lm(crew ~ 1) stepAIC(fit1,direction="backward") stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2)) stepAIC(fit2,direction="both",scope=list(upper=fit1,lower=fit2)) ########## Perform all possible regressions (aka all subset regressions) ########## Prints out best 4 models of each # of predictors install.packages("leaps") library(leaps) allcruise <- regsubsets(crew ~ age + tonnage + length + cabins + passdens, nbest=4,data=cruise) aprout <- summary(allcruise) n <- length(cruise$crew) p <- apply(aprout$which, 1, sum) aprout$aic <- aprout$bic - log(n) * p + 2 * p with(aprout,round(cbind(which,rsq,adjr2,cp,bic,aic),3)) ## Prints "readable" results plot(allcruise,scale="bic") plot(allcruise,scale="adjr2") ##### Re-fit the "best" model from Stepwise Regression ##### epred3 is the residual when the obs was not used in regression fit ##### epred3 = e3/(1-proj3) where proj3 are diagonal elements of PROJECTION matrix ##### cvpe = sum(epred3^2)/n = PRESS/n fit3 <- lm(crew ~ passdens + length + cabins) summary(fit3) anova(fit3) drop1(fit3,test="F") e3 <- residuals(fit3) proj3 <- hatvalues(fit3) epred3 <- e3/(1-proj3) (cvpe <- sum((epred3)^2)/length(e3)) ##### Cross-validation with a hold-out sample ##### Randomly sample 100 ships, fit model, obtain predictions for remaining 58 ships ##### by applying their X-levels to regression coefficients from model ##### Obtain "training" and "validation" sets cruise.cv.samp <- sample(1:length(crew),100,replace=FALSE) cruise.cv.in <- cruise[cruise.cv.samp,] cruise.cv.out <- cruise[-cruise.cv.samp,] ##### Fit model for training set fit.cv.in <- lm(crew ~ length + cabins + passdens, data=cruise.cv.in) summary(fit.cv.in) anova(fit.cv.in) drop1(fit.cv.in,test="F") ##### Obtain Predicted values and prediction errors for validation sample ##### Regression is based on same 3 predictors as fit3 (columns 6:8 of cruise) ##### Compute MSEP, Bias, Percent Bias, and Percent Error of Prediction pred.cv.out <- predict(fit.cv.in,cruise.cv.out[,6:8]) delta.cv.out <- crew[-cruise.cv.samp]-pred.cv.out (msep <- sum((delta.cv.out)^2)/length(crew[-cruise.cv.samp])) (bias2 <- (mean(delta.cv.out))^2) (pctbias <- 100*bias2/msep) (pcterrpred <- 100*sqrt(msep)/mean(crew[-cruise.cv.samp])) mean(delta.cv.out) sd(delta.cv.out) ##### Fit Model for Validation Sample fit.cv.out <- lm(crew ~ length + cabins + passdens, data=cruise.cv.out) summary(fit.cv.out) anova(fit.cv.out) drop1(fit.cv.out,test="F") ##### Perform k-fold cross-validation (fits 10 regressions, with 10 hold-out samples ##### Obtains avereage R^2 for hold-out samples and compares with full data fit install.packages("DAAG") library(DAAG) cruise.mod1 <- lm(crew ~ age + tonnage + length + cabins + passdens, data=cruise) CVlm(cruise, cruise.mod1, m=10) cruise.mod2 <- lm(crew ~ tonnage + length + cabins + passdens, data=cruise) CVlm(cruise, cruise.mod2, m=10) cruise.mod3 <- lm(crew ~ age + length + cabins + passdens, data=cruise) CVlm(cruise, cruise.mod3, m=10) cruise.mod4 <- lm(crew ~ length + cabins + passdens, data=cruise) CVlm(cruise, cruise.mod4, m=10) dev.off()