### Chapter 2 ### Example 2.1 ## Read data off web page, attach file as data frame, and list variable names clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv") attach(clt2016); names(clt2016) head(clt2016) ## Print first 6 observations ## Assign labels to the Catgories of Reasons for Stop labels.RsnStop <- c("ChkPnt","DWI","Invstgtn","Other", "SafeMove","SeatBelt","Speed","StopLgtSgn","VhclMove","Rgstrtn") ## Obtain and print frequency table for Reasons for Stop (table.RsnStop <- table(RsnStop)) ## Pie chart based on Table and Labels from above pie(table.RsnStop, labels.RsnStop, main="Pie Chart - CLT Traffic Stops") ## Bar chart based on Table and Labels from above (cex shrinks size of levels) barplot(table.RsnStop, names.arg=labels.RsnStop, main="Bar Chart - CLT Traffic Stops", xlab="Reason", ylab="Frequency", cex.names=0.6) ### Example 2.2 rm(list=ls(all=TRUE)) ### Read data and set up data frame nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv") attach(nhl); names(nhl) ### Generate random values (-0.5 to 0.5) to add to Height set.seed(1234) N <- NROW(nhl) Height.dev <- 0.5 - runif(N) Height <- Height + Height.dev ### Compute BMI bmi.nhl <- 703 * Weight / (Height^2) ### Obtain histogram hist(bmi.nhl, breaks=30, xlab="Body Mass Index", main="NHL BMI Distribution 2013-2014 Season") ### Examples 2.3 and 2.5 (Rock and Roll Marathon) rm(list=ls(all=TRUE)) ## Read data from website and attach data frame and obain variable names rr.mar <- read.csv( "http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv") attach(rr.mar); names(rr.mar) ## Obtain mean and standard deviation by gender tapply(mph,Gender,mean) tapply(mph,Gender,median) tapply(mph,Gender,var) tapply(mph,Gender,sd) ## Obtain the densities (for plotting) of mph by gender d.F <- density(mph[Gender=="F"]) d.M <- density(mph[Gender=="M"]) ## Set up a 2x2 grid for plots par(mfrow=c(2,2)) ## Histograms for Female and Male mph hist(mph[Gender=="F"],breaks=25,main="Histogram of Female Speeds", xlab="Female Speeds") hist(mph[Gender=="M"],breaks=25,main="Histogram of Male Speeds", xlab="Male Speeds") ## Density Plots for Female and Male mph plot(d.F, main="Kernel Density Plot of Female Speeds") plot(d.M, main="Kernel Density Plot of Male Speeds") ## Reset Plot to 1 per page and obtain side-by-side boxplots ## Gender is a factor vatiable (on the x-axis) par(mfrow=c(1,1)) plot(Gender, mph, main="Box Plots of Speed(mph) by Gender") ## Obtain a "violin plot" - a "smoothed density" version of boxplot install.packages("ggplot2") require(ggplot2) ggplot(rr.mar, aes(y=mph, x=Gender)) + geom_violin() ### Miami Weather Plots rm(list=ls(all=TRUE)) ## Read data and set up data frame mw1 <- read.csv("http://www.stat.ufl.edu/~winner/data/miami_weather.csv") attach(mw1); names(mw1) ## Obtain mean temperature by year (yearMeanTemp <- aggregate(meantemp ~ year, mw1, mean)) ## Stack Monthly and Annual plots par(mfrow=c(2,1)) ## Monthly Plot gives only "y", not "x", this is a line plot ## type="l" draws lines meeting points plot(meantemp, type="l", main="Miami Monthly Mean Temp (F) 1949-2014", xlab="Month", ylab="Mean Temperature") ## Plot "x"=Year (first column of yearMeanTemp) and ## "y"=mean temp (second column of yearMeanTemp0 plot(yearMeanTemp[,1], yearMeanTemp[,2], type="l", main="Miami Yearly Mean Temp (F) 1949-2014", xlab="Year", ylab="Mean Temperature") ### Bigfoot Map rm(list=ls(all=TRUE)) bigfoot <- read.csv("http://www.stat.ufl.edu/~winner/data/bigfoot_state.csv", header=TRUE) attach(bigfoot); names(bigfoot) install.packages("fiftystater") library(ggplot2) library(fiftystater) data("fifty_states") # this line is optional due to lazy data loading # map_id creates the aesthetic mapping to the state name column in your data p <- ggplot(bigfoot, aes(map_id = tolower(State))) + # map points to the fifty_states shape data geom_map(aes(fill = Bigfoot), map = fifty_states) + expand_limits(x = fifty_states$long, y = fifty_states$lat) + coord_map() + labs(title="Bigfoot Sightings") + scale_x_continuous(breaks = NULL) + scale_y_continuous(breaks = NULL) + labs(x = "", y = "") + theme(legend.position = "bottom", panel.background = element_blank()) p # add border boxes to AK/HI p + fifty_states_inset_boxes() ## Bivariate pplot library(colorplaner) p + aes(fill2 = Pop.m) + scale_fill_colorplane() + theme(legend.position = "right") ### Example 2.4 rm(list=ls(all=TRUE)) ## Read data off web page, attach file as data frame, and list variable names clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv") attach(clt2016); names(clt2016) (table.RsnStop <- table(RsnStop)) round(table.RsnStop / sum(table.RsnStop), 5) round(cbind(propRsnStop),5) ### Example 2.5 (NHL BMI portion) rm(list=ls(all=TRUE)) ### Read data and set up data frame nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv") attach(nhl); names(nhl) ### Generate random values (-0.5 to 0.5) to add to Height set.seed(1234) N <- NROW(nhl) Height.dev <- 0.5 - runif(N) Height <- Height + Height.dev ### Compute BMI bmi.nhl <- 703 * Weight / (Height^2) ### obtain the population size from number of rows of data frame (N <- NROW(nhl)) ### obtain the total of the BMI values (sum.BMI <- sum(bmi.nhl)) ### mean = sum / N sum.BMI/N ### Use built-in mean function mean(bmi.nhl) ### Obtain sorted bmi's bmi.nhl.sort <- sort(bmi.nhl) ### Print first few cases to confirm ordered head(bmi.nhl.sort) ### If N is even, average middle 2 cases, otherwise take middle case ifelse(N%%2==0,(bmi.nhl.sort[N/2]+bmi.nhl.sort[N/2+1])/2, bmi.nhl.sort[(N+1)/2]) ### Use built-in median function median(bmi.nhl) ### Examples 2.6-2.8 (NHL BMI portion) rm(list=ls(all=TRUE)) ### Read data and set up data frame nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv") attach(nhl); names(nhl) ### Generate random values (-0.5 to 0.5) to add to Height set.seed(1234) N <- NROW(nhl) Height.dev <- 0.5 - runif(N) Height <- Height + Height.dev ### Compute BMI bmi.nhl <- 703 * Weight / (Height^2) (bmi.max <- max(bmi.nhl)) # Highest BMI (bmi.min <- min(bmi.nhl)) # Lowest BMI (range <- bmi.max - bmi.min) # Compute Range (bmi.75 <- quantile(bmi.nhl,.75)) # BMI 75%-ile (bmi.25 <- quantile(bmi.nhl,.25)) # BMI 25%-ile (IQR <- bmi.75 - bmi.25) # Comute IQR (N <- length(bmi.nhl)) # Use "length" function to get N (mu <- mean(bmi.nhl)) # Use "mean" function to get mu (sum.dev2 <- sum((bmi.nhl - mu)^2)) # Numerator of Variance (sigma2 <- sum.dev2/N) # Population Variance (s2 <- sum.dev2/(N-1)) # Sample Variance (sigma <- sqrt(sigma2)) # Population Standard Deviation (s <- sqrt(s2)) # Sample Standard Deviation var(bmi.nhl) # Sample Variance with "var" function (N-1)*var(bmi.nhl)/N # Pop variance with "var" function sd(bmi.nhl) # Sample Std Dev with "sd" function sqrt((N-1)/N)*sd(bmi.nhl) # Population Std Dev with "sd" function ## Proportion of Individual w/in 1 and 2 SDs of mean sum(bmi.nhl >= mu-sigma & bmi.nhl <= mu+sigma) / N sum(bmi.nhl >= mu-2*sigma & bmi.nhl <= mu+2*sigma) / N (mad <- median(abs(bmi.nhl - mu))) # Median absolute deviation mad/0.645 # Approximating sigma (cv <- sigma/mu) # Coefficient of Variation (mu3 <- (sum((bmi.nhl-mu)^3)/N)) (skew <- mu3/(sigma^3)) (mu4 <- (sum((bmi.nhl-mu)^4)/N)) (kurt <- mu4/(sigma^4)-3) ### Examples 2.6-2.8 (Rock and Roll Marathon portion) rm(list=ls(all=TRUE)) ## Read data from website and attach data frame and obain variable names rr.mar <- read.csv( "http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv") attach(rr.mar); names(rr.mar) f.mph <- mph[Gender=="F"] (N.f <- length(f.mph)) (mean.f <- mean(f.mph)) (var.f <- var(f.mph)) (sd.f <- sd(f.mph)) sum(f.mph >= mean.f - sd.f & f.mph <= mean.f + sd.f) / N.f sum(f.mph >= mean.f - 2*sd.f & f.mph <= mean.f + 2*sd.f) / N.f (cv.f <- sd.f/mean.f) (mad.f <- median(abs(f.mph-mean.f))) (m3.f <- sum((f.mph-mean.f)^3)/N.f) (skew.f <- m3.f / sd.f^3) (m4.f <- sum((f.mph-mean.f)^4)/N.f) (kurt.f <- (m4.f/sd.f^4)-3) m.mph <- mph[Gender=="M"] (N.m <- length(m.mph)) (mean.m <- mean(m.mph)) (var.m <- var(m.mph)) (sd.m <- sd(m.mph)) sum(m.mph >= mean.m - sd.m & m.mph <= mean.m + sd.m) / N.m sum(m.mph >= mean.m - 2*sd.m & m.mph <= mean.m + 2*sd.m) / N.m (cv.m <- sd.m/mean.m) (mad.m <- median(abs(m.mph-mean.m))) (m3.m <- sum((m.mph-mean.m)^3)/N.m) (skew.m <- m3.m / sd.m^3) (m4.m <- sum((m.mph-mean.m)^4)/N.m) (kurt.m <- (m4.m/sd.m^4)-3) ### Example 2.9 rm(list=ls(all=TRUE)) ## Read data off web page, attach file as data frame, and list variable names clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv") attach(clt2016); names(clt2016) head(clt2016) ## Print first 6 observations ## Make OffMale and DrvMale "factor variables" and give labels for 0/1 OffMale <- factor(OffMale, labels=c("OfficerF","OfficerM")) DrvMale <- factor(DrvMale, labels=c("DriverF","DriverM")) ## Obtain Table of Counts (Row=Officer, Column=Driver) (omdm <- table(OffMale,DrvMale)) ## Obtain Row (1) and Column (2) Marginal Totals margin.table(omdm,1) margin.table(omdm,2) ## Obtain Proportions across all Cells omdm/sum(omdm) ## Obtain Row Proportions (Driver Genders w/in Officer Gender) prop.table(omdm,1) ## Obtain Column Proportions (Officer Genders w/in Driver Gender) prop.table(omdm,2) ## Obtain Cluster (Grouped) and Stacked Bar Plots ## t(prop.table(omdm,1)) takes transpose so that group var is Officer par(mfrow=c(1,2)) barplot(t(prop.table(omdm,1)),beside=T,legend=colnames(omdm),ylim=c(0,1), main="Grouped Bar Plot - CLT Traffic Stops") barplot(t(prop.table(omdm,1)),beside=F,legend=colnames(omdm), main="Stacked Bar Plot - CLT Traffic Stops") install.packages("vcd") library(vcd) table(RsnStop,OffMale,DrvMale, data=clt2016) mosaic(~RsnStop+OffMale+DrvMale, data=clt2016, shade=TRUE, legend=TRUE) ### Example 2.10 rm(list=ls(all=TRUE)) ## Read data from website and attach data frame and obain variable names rr.mar <- read.csv( "http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv") attach(rr.mar); names(rr.mar) ## Obtain the densities (for plotting) of mph by gender d.F <- density(mph[Gender=="F"]) d.M <- density(mph[Gender=="M"]) ## Density Plots for Female and Male mph plot(d.F,xlim=c(4,11),xlab="Speed(mph)",ylab="Density", main="Kernel Density Plot of Marathon Speeds") lines(d.M,lty=2) legend(9,0.45,c("Females","Males"),lty=c(1,2)) ## Combined histogram library(ggplot2) ggplot(rr.mar, aes(x=mph,fill=Gender)) + geom_histogram(binwidth=0.1) ### Example 2.11 rm(list=ls(all=TRUE)) sw1 <- read.csv("http://www.stat.ufl.edu/~winner/data/software1.csv") attach(sw1); names(sw1) cor(sizeProj,effortProj) cor(sizeProj[1:16],effortProj[1:16]) par(mfrow=c(2,2)) plot(sizeProj,effortProj,xlab="size",ylab="effort", main="Original Scale, All Projects") abline(lm(effortProj~sizeProj)) plot(sizeProj[1:16],effortProj[1:16],xlab="size",ylab="effort", main="Original Scale, Project 17 Removed") abline(lm(effortProj[1:16]~sizeProj[1:16])) cor(log(sizeProj),log(effortProj)) cor(log(sizeProj[1:16]),log(effortProj[1:16])) plot(log(sizeProj),log(effortProj),xlab="ln(size)",ylab="ln(effort)", main="Log Scale, All Projects") abline(lm(log(effortProj) ~ log(sizeProj))) plot(log(sizeProj[1:16]),log(effortProj[1:16]),xlab="ln(size)",ylab="ln(effort)", main="Log Scale, Project 17 Removed") abline(lm(log(effortProj[1:16]) ~ log(sizeProj[1:16]))) ### Example 2.12 rm(list=ls(all=TRUE)) rs1 <- read.csv("http://www.stat.ufl.edu/~winner/data/rockstrength.csv") attach(rs1); names(rs1) ## Obtain Scatterplot matrix of UCS, hb, gs, ga (columns 2,6,7,8 of rs1) plot(rs1[,c(2,6,7,8)]) ## Obtain correlation matrix of UCS, hb, gs, ga (columns 2,6,7,8 of rs1) cor(rs1[,c(2,6,7,8)]) ### Example 2.13 rm(list=ls(all=TRUE)) nasRace <- read.fwf("http://www.stat.ufl.edu/~winner/data/nascarr.dat", widths=c(3,6,4,4,9,7,9,9,7,5,3,4,4,9,7,8,5,38),col.names=c("seriesRace", "year","yearRace","numCar","payout","cpiU","spearman","kendall","trkLength", "lapsComp","roadRace","cautionFlag","leadChange","winTime","trkLat","trkLong", "trkCode","trkName")) attach(nasRace) length(spearman) summary(spearman) hist(spearman,breaks=seq(-.5,1,.02),prob=T, main="NASCAR Start/Finish - Spearman's rho") lines(density(spearman))