[Previously saved workspace restored] > pdf("F:\\Rmisc\\graphs\\cruise_ship.pdf") # Route Graphics to a pdf file > > ####### set random number seed for reproducible sampling based methods > set.seed(13579) > > cruise <- read.fwf("http://www.stat.ufl.edu/~winner/data/cruise_ship.dat", width=c(20,20,rep(8,7)), + col.names=c("ship", "cline", "age", "tonnage", "passengers", "length", "cabins", "passdens", "crew")) > > attach(cruise) > > ####### Fit Full model > > fit0 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens) > summary(fit0) Call: lm(formula = crew ~ age + tonnage + passengers + length + cabins + passdens) Residuals: Min 1Q Median 3Q Max -1.7700 -0.4881 -0.0938 0.4454 7.0077 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.5213400 1.0570350 -0.493 0.62258 age -0.0125449 0.0141975 -0.884 0.37832 tonnage 0.0132410 0.0118928 1.113 0.26732 passengers -0.1497640 0.0475886 -3.147 0.00199 ** length 0.4034785 0.1144548 3.525 0.00056 *** cabins 0.8016337 0.0892227 8.985 9.84e-16 *** passdens -0.0006577 0.0158098 -0.042 0.96687 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.9819 on 151 degrees of freedom Multiple R-squared: 0.9245, Adjusted R-squared: 0.9215 F-statistic: 308 on 6 and 151 DF, p-value: < 2.2e-16 > anova(fit0) Analysis of Variance Table Response: crew Df Sum Sq Mean Sq F value Pr(>F) age 1 542.66 542.66 562.8982 < 2.2e-16 *** tonnage 1 1118.50 1118.50 1160.2189 < 2.2e-16 *** passengers 1 24.17 24.17 25.0735 1.521e-06 *** length 1 16.62 16.62 17.2387 5.492e-05 *** cabins 1 79.56 79.56 82.5234 5.450e-16 *** passdens 1 0.00 0.00 0.0017 0.9669 Residuals 151 145.57 0.96 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > > > ######### Perform Backward Elimination, Forward Selection, and Stepwise Regression > ######### Based on Model AIC (not individual regression coefficients) > ######### fit1 and fit2 represent "extreme" models > > library(MASS) > fit1 <- lm(crew ~ age + tonnage + passengers + length + cabins + passdens) > fit2 <- lm(crew ~ 1) > stepAIC(fit1,direction="backward") Start: AIC=1.05 crew ~ age + tonnage + passengers + length + cabins + passdens Df Sum of Sq RSS AIC - passdens 1 0.002 145.57 -0.943 - age 1 0.753 146.32 -0.130 - tonnage 1 1.195 146.77 0.347 145.57 1.055 - passengers 1 9.548 155.12 9.092 - length 1 11.980 157.55 11.551 - cabins 1 77.821 223.39 66.721 Step: AIC=-0.94 crew ~ age + tonnage + passengers + length + cabins Df Sum of Sq RSS AIC - age 1 0.815 146.39 -2.062 145.57 -0.943 - tonnage 1 2.007 147.58 -0.780 - length 1 12.069 157.64 9.641 - passengers 1 14.027 159.60 11.591 - cabins 1 79.556 225.13 65.944 Step: AIC=-2.06 crew ~ tonnage + passengers + length + cabins Df Sum of Sq RSS AIC 146.39 -2.062 - tonnage 1 3.866 150.25 0.056 - length 1 11.739 158.13 8.126 - passengers 1 14.275 160.66 10.640 - cabins 1 78.861 225.25 64.028 Call: lm(formula = crew ~ tonnage + passengers + length + cabins) Coefficients: (Intercept) tonnage passengers length cabins -0.81871 0.01632 -0.14985 0.39755 0.79084 > stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2)) Start: AIC=397.18 crew ~ 1 Df Sum of Sq RSS AIC + cabins 1 1742.21 184.88 28.82 + tonnage 1 1658.03 269.05 88.10 + passengers 1 1614.23 312.86 111.94 + length 1 1546.60 380.49 142.86 + age 1 542.66 1384.42 346.93 + passdens 1 46.60 1880.48 395.32 1927.08 397.18 Step: AIC=28.82 crew ~ cabins Df Sum of Sq RSS AIC + length 1 22.9636 161.91 9.8661 + passdens 1 14.9541 169.92 17.4948 + tonnage 1 12.5135 172.36 19.7480 + passengers 1 7.0656 177.81 24.6647 + age 1 5.4442 179.43 26.0989 184.88 28.8215 Step: AIC=9.87 crew ~ cabins + length Df Sum of Sq RSS AIC + passengers 1 11.6609 150.25 0.0565 + passdens 1 6.3732 155.54 5.5212 161.91 9.8661 + age 1 1.9702 159.94 9.9317 + tonnage 1 1.2514 160.66 10.6402 Step: AIC=0.06 crew ~ cabins + length + passengers Df Sum of Sq RSS AIC + tonnage 1 3.8656 146.39 -2.06164 + age 1 2.6733 147.58 -0.77996 + passdens 1 2.5635 147.69 -0.66241 150.25 0.05650 Step: AIC=-2.06 crew ~ cabins + length + passengers + tonnage Df Sum of Sq RSS AIC 146.39 -2.06164 + age 1 0.81467 145.57 -0.94339 + passdens 1 0.06366 146.32 -0.13037 Call: lm(formula = crew ~ cabins + length + passengers + tonnage) Coefficients: (Intercept) cabins length passengers tonnage -0.81871 0.79084 0.39755 -0.14985 0.01632 > stepAIC(fit2,direction="both",scope=list(upper=fit1,lower=fit2)) Start: AIC=397.18 crew ~ 1 Df Sum of Sq RSS AIC + cabins 1 1742.21 184.88 28.82 + tonnage 1 1658.03 269.05 88.10 + passengers 1 1614.23 312.86 111.94 + length 1 1546.60 380.49 142.86 + age 1 542.66 1384.42 346.93 + passdens 1 46.60 1880.48 395.32 1927.08 397.18 Step: AIC=28.82 crew ~ cabins Df Sum of Sq RSS AIC + length 1 22.96 161.91 9.87 + passdens 1 14.95 169.92 17.49 + tonnage 1 12.51 172.36 19.75 + passengers 1 7.07 177.81 24.66 + age 1 5.44 179.43 26.10 184.88 28.82 - cabins 1 1742.21 1927.08 397.18 Step: AIC=9.87 crew ~ cabins + length Df Sum of Sq RSS AIC + passengers 1 11.661 150.25 0.056 + passdens 1 6.373 155.54 5.521 161.91 9.866 + age 1 1.970 159.94 9.932 + tonnage 1 1.251 160.66 10.640 - length 1 22.964 184.88 28.821 - cabins 1 218.571 380.49 142.859 Step: AIC=0.06 crew ~ cabins + length + passengers Df Sum of Sq RSS AIC + tonnage 1 3.866 146.39 -2.062 + age 1 2.673 147.58 -0.780 + passdens 1 2.563 147.69 -0.662 150.25 0.056 - passengers 1 11.661 161.91 9.866 - length 1 27.559 177.81 24.665 - cabins 1 95.781 246.03 75.974 Step: AIC=-2.06 crew ~ cabins + length + passengers + tonnage Df Sum of Sq RSS AIC 146.39 -2.062 + age 1 0.815 145.57 -0.943 + passdens 1 0.064 146.32 -0.130 - tonnage 1 3.866 150.25 0.056 - length 1 11.739 158.13 8.126 - passengers 1 14.275 160.66 10.640 - cabins 1 78.861 225.25 64.028 Call: lm(formula = crew ~ cabins + length + passengers + tonnage) Coefficients: (Intercept) cabins length passengers tonnage -0.81871 0.79084 0.39755 -0.14985 0.01632 > > ########## Perform all possible regressions (aka all subset regressions) > ########## Prints out best 4 models of each # of predictors > > install.packages("leaps") Installing package(s) into ‘C:/Users/winner/Documents/R/win-library/2.15’ (as ‘lib’ is unspecified) --- Please select a CRAN mirror for use in this session --- trying URL 'http://mirrors.nics.utk.edu/cran/bin/windows/contrib/2.15/leaps_2.9.zip' Content type 'application/zip' length 268778 bytes (262 Kb) opened URL downloaded 262 Kb package ‘leaps’ successfully unpacked and MD5 sums checked The downloaded binary packages are in C:\Users\winner\AppData\Local\Temp\Rtmp27Zc2i\downloaded_packages > library(leaps) Warning message: package ‘leaps’ was built under R version 2.15.3 > > allcruise <- regsubsets(crew ~ age + tonnage + passengers + length + cabins + passdens, + nbest=4,data=cruise) > aprout <- summary(allcruise) > with(aprout,round(cbind(which,rsq,adjr2,cp,bic),3)) ## Prints "readable" results (Intercept) age tonnage passengers length cabins passdens rsq adjr2 cp 1 1 0 0 0 0 1 0 0.904 0.903 37.772 1 1 0 1 0 0 0 0 0.860 0.859 125.086 1 1 0 0 1 0 0 0 0.838 0.837 170.523 1 1 0 0 0 1 0 0 0.803 0.801 240.675 2 1 0 0 0 1 1 0 0.916 0.915 15.952 2 1 0 0 0 0 1 1 0.912 0.911 24.261 2 1 0 1 0 0 1 0 0.911 0.909 26.792 2 1 0 0 1 0 1 0 0.908 0.907 32.443 3 1 0 0 1 1 1 0 0.922 0.921 5.857 3 1 0 0 0 1 1 1 0.919 0.918 11.341 3 1 0 1 1 0 1 0 0.918 0.916 14.023 3 1 1 0 0 1 1 0 0.917 0.915 15.909 4 1 0 1 1 1 1 0 0.924 0.922 3.847 4 1 1 0 1 1 1 0 0.923 0.921 5.084 4 1 0 0 1 1 1 1 0.923 0.921 5.197 4 1 0 1 0 1 1 1 0.919 0.917 13.056 5 1 1 1 1 1 1 0 0.924 0.922 5.002 5 1 0 1 1 1 1 1 0.924 0.922 5.781 5 1 1 0 1 1 1 1 0.924 0.921 6.240 5 1 1 1 0 1 1 1 0.920 0.917 14.904 6 1 1 1 1 1 1 1 0.924 0.921 7.000 bic 1 -360.238 1 -300.954 1 -277.122 1 -246.201 2 -376.131 2 -368.502 2 -366.249 2 -361.332 3 -382.878 3 -377.413 3 -374.808 3 -373.002 4 -381.933 4 -380.652 4 -380.534 4 -372.631 5 -377.752 5 -376.939 5 -376.462 5 -367.717 6 -372.692 > plot(allcruise,scale="bic") > plot(allcruise,scale="adjr2") > > ##### Re-fit the "best" model from Stepwise Regression > ##### epred3 is the residual when the obs was not used in regression fit > ##### epred3 = e3/(1-proj3) where proj3 are diagonal elements of PROJECTION matrix > ##### cvpe = sum(epred3^2)/n = PRESS/n > > fit3 <- lm(crew ~ tonnage + passengers + length + cabins) > summary(fit3) Call: lm(formula = crew ~ tonnage + passengers + length + cabins) Residuals: Min 1Q Median 3Q Max -1.7593 -0.4639 -0.0716 0.4698 7.0239 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.818710 0.584866 -1.400 0.163590 tonnage 0.016319 0.008119 2.010 0.046185 * passengers -0.149851 0.038795 -3.863 0.000165 *** length 0.397554 0.113499 3.503 0.000604 *** cabins 0.790837 0.087109 9.079 5.18e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.9782 on 153 degrees of freedom Multiple R-squared: 0.924, Adjusted R-squared: 0.9221 F-statistic: 465.3 on 4 and 153 DF, p-value: < 2.2e-16 > anova(fit3) Analysis of Variance Table Response: crew Df Sum Sq Mean Sq F value Pr(>F) tonnage 1 1658.03 1658.03 1732.930 < 2.2e-16 *** passengers 1 26.90 26.90 28.119 3.925e-07 *** length 1 16.90 16.90 17.663 4.468e-05 *** cabins 1 78.86 78.86 82.424 5.177e-16 *** Residuals 153 146.39 0.96 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > e3 <- residuals(fit3) > proj3 <- hatvalues(fit3) > epred3 <- e3/(1-proj3) > (cvpe <- sum((epred3)^2)/length(e3)) [1] 0.9800502 > > ##### Cross-validation with a hold-out sample > ##### Randomly sample 100 ships, fit model, obtain predictions for remaining 58 ships > ##### by applying their X-levels to regression coefficients from model > > ##### Obtain "training" and "validation" sets > > cruise.cv.samp <- sample(1:length(crew),100,replace=FALSE) > cruise.cv.in <- cruise[cruise.cv.samp,] > cruise.cv.out <- cruise[-cruise.cv.samp,] > > ##### Fit model for training set > > fit.cv.in <- lm(crew ~tonnage + passengers + length + cabins, + data=cruise.cv.in) > summary(fit.cv.in) Call: lm(formula = crew ~ tonnage + passengers + length + cabins, data = cruise.cv.in) Residuals: Min 1Q Median 3Q Max -1.6837 -0.4636 -0.0224 0.3040 6.8875 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.10180 0.77347 -1.424 0.157581 tonnage 0.00479 0.01177 0.407 0.685054 passengers -0.19192 0.05445 -3.525 0.000654 *** length 0.45647 0.14573 3.132 0.002306 ** cabins 0.95060 0.14510 6.551 2.92e-09 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.059 on 95 degrees of freedom Multiple R-squared: 0.9203, Adjusted R-squared: 0.9169 F-statistic: 274.2 on 4 and 95 DF, p-value: < 2.2e-16 > anova(fit.cv.in) Analysis of Variance Table Response: crew Df Sum Sq Mean Sq F value Pr(>F) tonnage 1 1159.23 1159.23 1032.793 < 2.2e-16 *** passengers 1 11.28 11.28 10.046 0.002055 ** length 1 12.43 12.43 11.076 0.001245 ** cabins 1 48.17 48.17 42.919 2.923e-09 *** Residuals 95 106.63 1.12 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 > > ##### Obtain Predicted values and prediction errors for validation sample > ##### Regression is based on same 4 predictors as fit3 (columns 4:7 of cruise) > ##### Compute MSEP > > pred.cv.out <- predict(fit.cv.in,cruise.cv.out[,4:7]) > delta.cv.out <- crew[-cruise.cv.samp]-pred.cv.out > (msep <- sum((delta.cv.out)^2)/length(crew[-cruise.cv.samp])) [1] 0.7578447