### Chapter 2

### Example 2.1

## Read data off web page, attach file as data frame, and list variable names
clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv")
attach(clt2016); names(clt2016)

head(clt2016)    ## Print first 6 observations 

## Assign labels to the Catgories of Reasons for Stop
labels.RsnStop <- c("ChkPnt","DWI","Invstgtn","Other",
   "SafeMove","SeatBelt","Speed","StopLgtSgn","VhclMove","Rgstrtn")

## Obtain and print frequency table for Reasons for Stop
(table.RsnStop <- table(RsnStop))

## Pie chart based on Table and Labels from above
pie(table.RsnStop, labels.RsnStop, main="Pie Chart - CLT Traffic Stops")

## Bar chart based on Table and Labels from above (cex shrinks size of levels)
barplot(table.RsnStop, names.arg=labels.RsnStop, 
   main="Bar Chart - CLT Traffic Stops", xlab="Reason", ylab="Frequency",
   cex.names=0.6)

### Example 2.2

rm(list=ls(all=TRUE))


### Read data and set up data frame
nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv")
attach(nhl); names(nhl)

### Generate random values (-0.5 to 0.5) to add to Height
set.seed(1234)
N <- NROW(nhl)
Height.dev <- 0.5 - runif(N)
Height <- Height + Height.dev

### Compute BMI
bmi.nhl <- 703 * Weight / (Height^2)   

### Obtain histogram
hist(bmi.nhl, breaks=30, xlab="Body Mass Index", 
main="NHL BMI Distribution 2013-2014 Season")


### Examples 2.3 and 2.5 (Rock and Roll Marathon)

rm(list=ls(all=TRUE))


## Read data from website and attach data frame and obain variable names
rr.mar <- read.csv(
"http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv")
attach(rr.mar); names(rr.mar)

## Obtain mean and standard deviation by gender
tapply(mph,Gender,mean)
tapply(mph,Gender,median)

tapply(mph,Gender,var)
tapply(mph,Gender,sd)

## Obtain the densities (for plotting) of mph by gender
d.F <- density(mph[Gender=="F"])
d.M <- density(mph[Gender=="M"])

## Set up a 2x2 grid for plots 
par(mfrow=c(2,2))
## Histograms for Female and Male mph
hist(mph[Gender=="F"],breaks=25,main="Histogram of Female Speeds",
   xlab="Female Speeds")
hist(mph[Gender=="M"],breaks=25,main="Histogram of Male Speeds",
   xlab="Male Speeds")
## Density Plots for Female and Male mph
plot(d.F,
  main="Kernel Density Plot of Female Speeds")
plot(d.M,
  main="Kernel Density Plot of Male Speeds")

## Reset Plot to 1 per page and obtain side-by-side boxplots
## Gender is a factor vatiable (on the x-axis)
par(mfrow=c(1,1))
plot(Gender, mph, main="Box Plots of Speed(mph) by Gender")

## Obtain a "violin plot" - a "smoothed density" version of boxplot
install.packages("ggplot2")
require(ggplot2)
ggplot(rr.mar, aes(y=mph, x=Gender)) + geom_violin()

### Miami Weather Plots

rm(list=ls(all=TRUE))

## Read data and set up data frame
mw1 <- read.csv("http://www.stat.ufl.edu/~winner/data/miami_weather.csv")
attach(mw1); names(mw1)
 
## Obtain mean temperature by year
(yearMeanTemp <- aggregate(meantemp ~ year, mw1, mean))

## Stack Monthly and Annual plots 
par(mfrow=c(2,1))

## Monthly Plot gives only "y", not "x", this is a line plot
## type="l" draws lines meeting points
plot(meantemp, type="l", main="Miami Monthly Mean Temp (F) 1949-2014",
  xlab="Month", ylab="Mean Temperature")

## Plot "x"=Year (first column of yearMeanTemp) and
##  "y"=mean temp (second column of yearMeanTemp0
plot(yearMeanTemp[,1], yearMeanTemp[,2], 
  type="l", main="Miami Yearly Mean Temp (F) 1949-2014",
  xlab="Year", ylab="Mean Temperature")

### Bigfoot Map

rm(list=ls(all=TRUE))

bigfoot <- read.csv("http://www.stat.ufl.edu/~winner/data/bigfoot_state.csv",
                    header=TRUE)
attach(bigfoot); names(bigfoot)

install.packages("fiftystater")
library(ggplot2)
library(fiftystater)

data("fifty_states") # this line is optional due to lazy data loading

# map_id creates the aesthetic mapping to the state name column in your data
p <- ggplot(bigfoot, aes(map_id = tolower(State))) + 
  # map points to the fifty_states shape data
  geom_map(aes(fill = Bigfoot), map = fifty_states) + 
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() + labs(title="Bigfoot Sightings") +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  labs(x = "", y = "") +
  theme(legend.position = "bottom", 
        panel.background = element_blank())

p
# add border boxes to AK/HI
p + fifty_states_inset_boxes() 

## Bivariate pplot
library(colorplaner)
p + aes(fill2 = Pop.m) + scale_fill_colorplane() +
  theme(legend.position = "right")

### Example 2.4

rm(list=ls(all=TRUE))

## Read data off web page, attach file as data frame, and list variable names
clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv")
attach(clt2016); names(clt2016)

(table.RsnStop <- table(RsnStop))
round(table.RsnStop / sum(table.RsnStop), 5)
round(cbind(propRsnStop),5)

### Example 2.5 (NHL BMI portion)

rm(list=ls(all=TRUE))

### Read data and set up data frame
nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv")
attach(nhl); names(nhl)

### Generate random values (-0.5 to 0.5) to add to Height
set.seed(1234)
N <- NROW(nhl)
Height.dev <- 0.5 - runif(N)
Height <- Height + Height.dev

### Compute BMI
bmi.nhl <- 703 * Weight / (Height^2)   

### obtain the population size from number of rows of data frame
(N <- NROW(nhl))
### obtain the total of the BMI values
(sum.BMI <- sum(bmi.nhl))
### mean = sum / N
sum.BMI/N
### Use built-in mean function
mean(bmi.nhl)

### Obtain sorted bmi's 
bmi.nhl.sort <- sort(bmi.nhl)
### Print first few cases to confirm ordered
head(bmi.nhl.sort)
### If N is even, average middle 2 cases, otherwise take middle case
ifelse(N%%2==0,(bmi.nhl.sort[N/2]+bmi.nhl.sort[N/2+1])/2,
      bmi.nhl.sort[(N+1)/2])
### Use built-in median function
median(bmi.nhl)


### Examples 2.6-2.8 (NHL BMI portion)

rm(list=ls(all=TRUE))

### Read data and set up data frame
nhl <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv")
attach(nhl); names(nhl)

### Generate random values (-0.5 to 0.5) to add to Height
set.seed(1234)
N <- NROW(nhl)
Height.dev <- 0.5 - runif(N)
Height <- Height + Height.dev

### Compute BMI
bmi.nhl <- 703 * Weight / (Height^2)   

(bmi.max <- max(bmi.nhl))    # Highest BMI
(bmi.min <- min(bmi.nhl))    # Lowest BMI
(range <- bmi.max - bmi.min) # Compute Range
(bmi.75 <- quantile(bmi.nhl,.75))    # BMI 75%-ile
(bmi.25 <- quantile(bmi.nhl,.25))    # BMI 25%-ile
(IQR <-  bmi.75 - bmi.25)    # Comute IQR
(N <- length(bmi.nhl))     # Use "length" function to get N
(mu <- mean(bmi.nhl))      # Use "mean" function to get mu
(sum.dev2 <- sum((bmi.nhl - mu)^2)) # Numerator of Variance
(sigma2 <- sum.dev2/N)     # Population Variance
(s2 <- sum.dev2/(N-1))     # Sample Variance
(sigma <- sqrt(sigma2))    # Population Standard Deviation
(s <- sqrt(s2))            # Sample Standard Deviation
var(bmi.nhl)               # Sample Variance with "var" function
(N-1)*var(bmi.nhl)/N       # Pop variance with "var" function
sd(bmi.nhl)                # Sample Std Dev with "sd" function
sqrt((N-1)/N)*sd(bmi.nhl)  # Population Std Dev with "sd" function
## Proportion of Individual w/in 1 and 2 SDs of mean
sum(bmi.nhl >= mu-sigma & bmi.nhl <= mu+sigma) / N
sum(bmi.nhl >= mu-2*sigma & bmi.nhl <= mu+2*sigma) / N
(mad <- median(abs(bmi.nhl - mu))) # Median absolute deviation
mad/0.645                          # Approximating sigma
(cv <- sigma/mu)                   # Coefficient of Variation

(mu3 <-  (sum((bmi.nhl-mu)^3)/N))
(skew <- mu3/(sigma^3))

(mu4 <-  (sum((bmi.nhl-mu)^4)/N))
(kurt <- mu4/(sigma^4)-3)



### Examples 2.6-2.8 (Rock and Roll Marathon portion)

rm(list=ls(all=TRUE))

## Read data from website and attach data frame and obain variable names
rr.mar <- read.csv(
"http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv")
attach(rr.mar); names(rr.mar)

f.mph <- mph[Gender=="F"]
(N.f <- length(f.mph))
(mean.f <- mean(f.mph))
(var.f <- var(f.mph))
(sd.f <- sd(f.mph))
sum(f.mph >= mean.f - sd.f & f.mph <= mean.f + sd.f) / N.f
sum(f.mph >= mean.f - 2*sd.f & f.mph <= mean.f + 2*sd.f) / N.f
(cv.f <- sd.f/mean.f)
(mad.f <- median(abs(f.mph-mean.f)))
(m3.f <- sum((f.mph-mean.f)^3)/N.f)
(skew.f <- m3.f / sd.f^3)
(m4.f <- sum((f.mph-mean.f)^4)/N.f)
(kurt.f <- (m4.f/sd.f^4)-3)

m.mph <- mph[Gender=="M"]
(N.m <- length(m.mph))
(mean.m <- mean(m.mph))
(var.m <- var(m.mph))
(sd.m <- sd(m.mph))
sum(m.mph >= mean.m - sd.m & m.mph <= mean.m + sd.m) / N.m
sum(m.mph >= mean.m - 2*sd.m & m.mph <= mean.m + 2*sd.m) / N.m
(cv.m <- sd.m/mean.m)
(mad.m <- median(abs(m.mph-mean.m)))
(m3.m <- sum((m.mph-mean.m)^3)/N.m)
(skew.m <- m3.m / sd.m^3)
(m4.m <- sum((m.mph-mean.m)^4)/N.m)
(kurt.m <- (m4.m/sd.m^4)-3)


### Example 2.9

rm(list=ls(all=TRUE))

## Read data off web page, attach file as data frame, and list variable names
clt2016 <- read.csv("http://www.stat.ufl.edu/~winner/data/trafficstop.csv")
attach(clt2016); names(clt2016)

head(clt2016)    ## Print first 6 observations 

## Make OffMale and DrvMale "factor variables" and give labels for 0/1
OffMale <- factor(OffMale, labels=c("OfficerF","OfficerM"))
DrvMale <- factor(DrvMale, labels=c("DriverF","DriverM"))

## Obtain Table of Counts (Row=Officer, Column=Driver)
(omdm <- table(OffMale,DrvMale))
## Obtain Row (1) and Column (2) Marginal Totals
margin.table(omdm,1)
margin.table(omdm,2)
## Obtain Proportions across all Cells
omdm/sum(omdm)
## Obtain Row Proportions (Driver Genders w/in Officer Gender)
prop.table(omdm,1)
## Obtain Column Proportions (Officer Genders w/in Driver Gender)
prop.table(omdm,2)

## Obtain Cluster (Grouped) and Stacked Bar Plots
## t(prop.table(omdm,1)) takes transpose so that group var is Officer 
par(mfrow=c(1,2))
barplot(t(prop.table(omdm,1)),beside=T,legend=colnames(omdm),ylim=c(0,1),
   main="Grouped Bar Plot - CLT Traffic Stops")
barplot(t(prop.table(omdm,1)),beside=F,legend=colnames(omdm),
   main="Stacked Bar Plot - CLT Traffic Stops")

install.packages("vcd") 
library(vcd)
table(RsnStop,OffMale,DrvMale, data=clt2016)
mosaic(~RsnStop+OffMale+DrvMale, data=clt2016, shade=TRUE, legend=TRUE)


### Example 2.10

rm(list=ls(all=TRUE))

## Read data from website and attach data frame and obain variable names
rr.mar <- read.csv(
"http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv")
attach(rr.mar); names(rr.mar)

## Obtain the densities (for plotting) of mph by gender
d.F <- density(mph[Gender=="F"])
d.M <- density(mph[Gender=="M"])

## Density Plots for Female and Male mph
plot(d.F,xlim=c(4,11),xlab="Speed(mph)",ylab="Density",
  main="Kernel Density Plot of Marathon Speeds")
lines(d.M,lty=2)
legend(9,0.45,c("Females","Males"),lty=c(1,2))

## Combined histogram
library(ggplot2)
ggplot(rr.mar, aes(x=mph,fill=Gender)) +
   geom_histogram(binwidth=0.1)

### Example 2.11

rm(list=ls(all=TRUE))

sw1 <- read.csv("http://www.stat.ufl.edu/~winner/data/software1.csv")
attach(sw1); names(sw1)

cor(sizeProj,effortProj)
cor(sizeProj[1:16],effortProj[1:16])

par(mfrow=c(2,2))
plot(sizeProj,effortProj,xlab="size",ylab="effort",
   main="Original Scale, All Projects")
abline(lm(effortProj~sizeProj))
plot(sizeProj[1:16],effortProj[1:16],xlab="size",ylab="effort",
   main="Original Scale, Project 17 Removed")
abline(lm(effortProj[1:16]~sizeProj[1:16]))

cor(log(sizeProj),log(effortProj))
cor(log(sizeProj[1:16]),log(effortProj[1:16]))

plot(log(sizeProj),log(effortProj),xlab="ln(size)",ylab="ln(effort)",
   main="Log Scale, All Projects")
abline(lm(log(effortProj) ~ log(sizeProj)))
plot(log(sizeProj[1:16]),log(effortProj[1:16]),xlab="ln(size)",ylab="ln(effort)",
   main="Log Scale, Project 17 Removed")
abline(lm(log(effortProj[1:16]) ~ log(sizeProj[1:16])))


### Example 2.12

rm(list=ls(all=TRUE))

rs1 <- read.csv("http://www.stat.ufl.edu/~winner/data/rockstrength.csv")
attach(rs1); names(rs1)

## Obtain Scatterplot matrix of UCS, hb, gs, ga (columns 2,6,7,8 of rs1)
plot(rs1[,c(2,6,7,8)])

## Obtain correlation matrix of UCS, hb, gs, ga  (columns 2,6,7,8 of rs1)
cor(rs1[,c(2,6,7,8)])

### Example 2.13

rm(list=ls(all=TRUE))

nasRace <- read.fwf("http://www.stat.ufl.edu/~winner/data/nascarr.dat",
  widths=c(3,6,4,4,9,7,9,9,7,5,3,4,4,9,7,8,5,38),col.names=c("seriesRace",
  "year","yearRace","numCar","payout","cpiU","spearman","kendall","trkLength",
  "lapsComp","roadRace","cautionFlag","leadChange","winTime","trkLat","trkLong",
  "trkCode","trkName"))
attach(nasRace)

length(spearman)
summary(spearman)

hist(spearman,breaks=seq(-.5,1,.02),prob=T,
main="NASCAR Start/Finish - Spearman's rho")
lines(density(spearman))