## Chapter 1

######## Section 1.1

## Discrete Probability Distribution
## WNBA Home Team versus Point Spread - 2010-2018 Regular Season Games
## y = -1 (Loss), 0 (Tie "Push"), 1 (Win)
## Population of N = 1833 games w/ 917 Losses, 19 Ties, 897 Wins
## => p(-1) = 917/1833 = 0.50027, p(0) = 0.01037, p(1) = 0.48936

y <- c(-1, 0, 1)              ## Bet Outcomes (Lose, Tie, Win)
num.y <- c(917, 19, 897)      ## Outcome Frequencies
p.y <- num.y / sum(num.y)     ## Outcome Probabilities: p(y) = P(Y=y)
F.y <- cumsum(p.y)            ## Cumulative Prob. Dist.: F(y) = P(Y<=y)

## Print out y, p(y), F(y)
prob.y.out <- cbind(y, p.y, F.y)
colnames(prob.y.out) <- c("y", "p(y)", "F(y)=P(Y<=y)")
round(prob.y.out, 5)

## Barplot of the probability distribution
barplot(p.y, names=as.character(y), main="WNBA Home Bet Result",
        col = c("red", "yellow", "green"))

## Compute Expectation (Mean), Variance and Standard Deviation for Y
(E.Y <- sum(y * p.y))               ## E{Y} = mu_Y
(V.Y <- sum((y - E.Y)^2 * p.y))     ## V{Y} = E{(Y-mu_Y)^2} = sigma_Y^2
(sigma.Y <- sqrt(V.Y))              ## sigma_Y

## Suppose W = 100*Y - 10 where W is winnings on a Bet of 100 w/ fee of 10
a <- -10; b <- 100                  ## Constants a,b for W = a + b*Y
w <- a + b*y                        ## Compute w for each outcome y
(E.W1 <- sum(w * p.y))              ## E{W} by direct computation
(V.W1 <- sum((w-E.W1)^2 * p.y))     ## V{W} by direct computation
(sigma.W1 <- sqrt(V.W1))            ## sigma_W by direct computation
(E.W2 <- a + b * E.Y)               ## E{W} as linear function of Y
(V.W2 <- b^2 * V.Y)                 ## V{W} as linear function of Y
(sigma.W2 <- abs(b) * sigma.Y)      ## sigma_W as linear function of Y

## Binomial Distribution
## n independent Bernoulli trials, each with 2 possible outcomes: S or F
## Probability of "Success": P(S)=p, Probability of "Failure": P(F)=1-p
## Y = number of Successes in the n independent trials   y=0,1,...,n
## Number of ways n trials can yield y S's: n "choose" y = n!/[y!(n-y)!]
## Probability of each outcome with y S's: p^y * (1-p)^(n-y)
## => P(Y=y) = p(y) = n!/[y!(n-y)!] * p^y * (1-p)^(n-y)   y=0,1,...,n
## E{Y} = n*p     V{Y} = n*p*(1-p)

# Example - Over the course of his NBA career, Shaquille O'Neal
# made .5275 (5935/11252) of his Free Throws in Regular Season games.
# Suppose we observe him take n = 25 Free Throws in games and
# observe Y = # of successful attempts.
# Under this Model, Y ~ Binomial(n=25, p=0.5275)

# Plot the probability distribution of Y
n <- 25; p <- .5275
win.graph(height=5.5, width=7.0)
barplot(dbinom(0:n, n, p), names = as.character(0:n),
        xlab="# Successes", ylab="Probability",
        main="Shaq FTs ~ Bin(n=25,p=.5275)")

## Mean, Variance, Standard Deviation
(E.Y <- n*p)
(V.Y <- n*p*(1-p))
(sigma.Y <- sqrt(n*p*(1-p)))
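## Added sanity check (a minimal sketch, not in the original notes): the
## closed-form pmf n!/[y!(n-y)!] * p^y * (1-p)^(n-y), built with choose(),
## should agree with R's dbinom() for the Shaq example above
p.direct <- choose(n, 0:n) * p^(0:n) * (1-p)^(n-(0:n))
all.equal(p.direct, dbinom(0:n, n, p))   ## TRUE up to numerical error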
## Poisson Distribution
## Number of instances of a particular outcome in some fixed amount
## of time or space. Can be derived by breaking the fixed time/space
## into many sub-units which can contain either 0 or 1 events.
## The limit of the binomial as n gets very large and p gets very small,
## but n*p remains constant with n*p = lambda.
## P(Y=y) = exp(-lambda) * lambda^y / y!   y=0,1,2,...
## E{Y} = V{Y} = lambda

# Example - Premier Soccer (football outside USA) games are officially 90
# minutes long. Consider all German Football League games from the 2013
# season, with Y = Total Goals, which had a mean of 3.16 and variance of 2.94.
# Consider a Poisson model with lambda = 3 for Total Goals and the
# random variable Y = # of goals in a randomly played game

# Plot the probability distribution of Y
lambda <- 3.0
win.graph(height=5.5, width=7.0)
barplot(c(dpois(0:7, lambda), 1-ppois(7,lambda)),
        names = c(as.character(0:7), "8+"),
        xlab="# Goals", ylab="Probability",
        main="GFL Goals/Game ~ Poi(lambda=3)")

## Negative Binomial Distribution
## 2 Common Applications:
## A) Number of Bernoulli trials needed to obtain r Successes
## B) Count data where the mean and variance are not equal, which
##    arises from a mixture of Poissons (with varying lambdas)
## For Case B), Notation is: Y ~ NB(mu, alpha) where:
## P(Y=y) = p(y) =
##   [Gamma((1/alpha)+y)/(Gamma(1/alpha)*Gamma(y+1))] *
##   ((1/alpha)/((1/alpha)+mu))^(1/alpha) * (mu/((1/alpha)+mu))^y   y=0,1,...
## Gamma(w) = integral_0^Inf x^(w-1) e^(-x) dx
## E{Y} = mu     V{Y} = mu*(1 + alpha*mu)

# Example - Lead Changes in NASCAR races on longer (>=1.5 mile) tracks, 1975-2003
# N = 452 races: mu = 23.5, sigma^2 = 138.0
# Using mu = 23.5, alpha = 0.207 as parameters of Negative Binomial Dist.
# Note that R parameterizes the model as case A), not case B).
# In functions that model B), software often produces estimates of 1/alpha.
mu <- 23.5; alpha <- 0.207
negbin.c <- function(y, mu, alpha) {
  (gamma((1/alpha)+y)/(gamma(1/alpha)*gamma(y+1))) *
    ((1/alpha)/((1/alpha)+mu))^(1/alpha) * (mu/((1/alpha)+mu))^y
}
win.graph(height=5.5, width=7.0)
plot(0:80, negbin.c(0:80, mu, alpha), type="s", xlab="y", ylab="p(y)",
     main = "NASCAR Lead Changes ~ NB(mu=23.5, alpha=0.207)")

## Normal (aka Gaussian) Distribution
## Model for continuous data that have a mound shape - symmetric, with
## measurements tending to fall close around the mean and fewer
## measurements in the tails
## Prob. Density Function (pdf):
## f(y) = (1/sqrt(2*pi*sigma^2)) * exp((-1/2) * ((y - mu)^2 / sigma^2))
##        -Inf < mu < Inf, sigma > 0, -Inf < y < Inf
## P(mu-2*sigma <= Y <= mu+2*sigma) = .9544
## P(mu-sigma <= Y <= mu+sigma) = .6826
## E{Y} = mu, V{Y} = sigma^2

## Example - Body Mass Indices among Professional Athletes are approx. Normal
## Based on the 2013/2014 seasons (N_NHL=717, N_EPL=526):
## Among National Hockey League (NHL) players, BMI ~ N(mu=26.50, sigma=1.46)
## Among English Premier League (EPL) players, BMI ~ N(mu=23.02, sigma=1.71)
mu.NHL <- 26.50; sigma.NHL <- 1.46
mu.EPL <- 23.02; sigma.EPL <- 1.71
y <- seq(18, 32, length=1000)
win.graph(width=7.0, height=5.5)
plot(y, dnorm(y, mu.NHL, sigma.NHL), type="l", col="red", lwd=3,
     xlab="y", ylab="f(y)", main="Body Mass Index Densities")
lines(y, dnorm(y, mu.EPL, sigma.EPL), col="green", lwd=3)
legend(18, .22, c("NHL","EPL"), lty=c(1,1), col=c("red","green"))
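## Added check (a minimal sketch): the .6826 and .9544 coverage values quoted
## above follow directly from the normal CDF, e.g. for the NHL BMI density
pnorm(mu.NHL + sigma.NHL, mu.NHL, sigma.NHL) -
  pnorm(mu.NHL - sigma.NHL, mu.NHL, sigma.NHL)     ## ~ 0.6827
pnorm(mu.NHL + 2*sigma.NHL, mu.NHL, sigma.NHL) -
  pnorm(mu.NHL - 2*sigma.NHL, mu.NHL, sigma.NHL)   ## ~ 0.9545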
## Gamma Distribution
## Used to model positive, continuous random variables that tend to be
## skewed to the right. Model based on shape and rate parameters:
## Prob. Density Function:
## f(y) = (beta^alpha/Gamma(alpha)) * y^(alpha-1) * exp(-y*beta)
##        y > 0, alpha, beta > 0
## E{Y} = alpha/beta, V{Y} = alpha/(beta^2), Mode = (alpha-1)/beta (alpha>1)

# Example - Velocities of marathon participants at 2015 Rock&Roll Marathon
# Males:   N=1454: mean=6.337, sigma=1.058 => alpha=35.896, beta=5.665
# Females: N=1045: mean=5.840, sigma=0.831 => alpha=49.831, beta=8.533
alpha.M <- 35.896; beta.M <- 5.665
alpha.F <- 49.831; beta.F <- 8.533
y <- seq(3, 11, length=1000)
y.max <- 1.1 * max(dgamma(y, alpha.M, beta.M), dgamma(y, alpha.F, beta.F))
win.graph(width=7.0, height=5.5)
plot(y, dgamma(y, alpha.M, beta.M), type="l", col="red", lwd=3,
     xlab="y", ylab="f(y)", main="Marathon Miles per Hour", ylim=c(0,y.max))
lines(y, dgamma(y, alpha.F, beta.F), col="green", lwd=3)
legend(9, .40, c("Males","Females"), lty=c(1,1), col=c("red","green"))

## Beta Distribution
## Used to model continuous distributions on a finite range [a,b],
## which is often [0,1] if Y represents proportions or probabilities
## Prob. Density Function:
## f(y) = (Gamma(alpha+beta)/(Gamma(alpha)*Gamma(beta))) * (1/(b-a)) *
##        ((y-a)/(b-a))^(alpha-1) * ((b-y)/(b-a))^(beta-1)
##        a <= y <= b, alpha, beta > 0
## E{Y} = a + (alpha/(alpha+beta))*(b-a)
## V{Y} = (b-a)^2 * (alpha*beta)/((alpha+beta)^2*(alpha+beta+1))
## Mode = a + ((alpha-1)/(alpha+beta-2))*(b-a) if alpha, beta > 1

# Example - NBA Team 3-Point Success Proportions by game - 2016/17 season
# Y = Proportion of 3-Point shots made by team per game (a=0, b=1)
# mu = .3566, sigma = .0946 => alpha = 8.78, beta = 15.83
alpha <- 8.78; beta <- 15.83
y <- seq(0, 1, length=1000)
win.graph(width=7.0, height=5.5)
plot(y, dbeta(y, alpha, beta), type="l", lwd=3, col="purple",
     xlab="y", ylab="f(y)", main="NBA Team 3-Point Proportions")
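## Added sketch: how the reported mu = .3566, sigma = .0946 map to
## (alpha, beta) by method of moments for a=0, b=1, using
## mu = alpha/(alpha+beta), sigma^2 = alpha*beta/((alpha+beta)^2*(alpha+beta+1))
## (the names mu.3pt, sigma.3pt, alpha.mm, beta.mm are new here)
mu.3pt <- .3566; sigma.3pt <- .0946
ab.sum <- mu.3pt * (1 - mu.3pt) / sigma.3pt^2 - 1   ## solves for alpha+beta
(alpha.mm <- mu.3pt * ab.sum)                       ## ~ 8.78
(beta.mm <- (1 - mu.3pt) * ab.sum)                  ## ~ 15.83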
######## Section 1.2

## Bivariate Normal Distribution (scalar version)
## f(x,y) = [1 / (2*pi*sqrt(sigmaX^2*sigmaY^2*(1-rho^2)))] *
##   exp{(-1/(2*(1-rho^2))) *
##     [((x-muX)^2/sigmaX^2) - (2*rho*(x-muX)*(y-muY)/(sigmaX*sigmaY)) +
##      ((y-muY)^2/sigmaY^2)]}
## E{X}=muX, E{Y}=muY, V{X}=sigmaX^2, V{Y}=sigmaY^2,
## COV{X,Y}=sigmaXY, CORR{X,Y}=rho=sigmaXY/(sigmaX*sigmaY)
## E{Y|X=x} = muY + (sigmaXY/sigmaX^2)*(x-muX), V{Y|X=x} = sigmaY^2*(1-rho^2)
## Linear functions: W = aX + bY
## => W ~ N(a*muX + b*muY, a^2*sigmaX^2 + b^2*sigmaY^2 + 2*a*b*sigmaXY)

# Example - NBA Playoffs Over/Under and Total Points, 1991-2018
# Oddsmakers set an Over/Under line for sports matches and bettors can
# place bets as Over (teams' combined points exceed the OU line) or Under.
# Let X=OU and Y=Total Points. Based on all NBA playoff games for 1991-2018,
# (X,Y) is well approximated as Bivariate Normal with:
# muX=193.38, sigmaX=13.49, muY=193.05, sigmaY=21.72, sigmaXY=179.85, rho=.61

# Plot bivariate density - Brute force
muX <- 193.38; sigmaX <- 13.49
muY <- 193.05; sigmaY <- 21.72
sigmaXY <- 179.85
rho <- sigmaXY / (sigmaX * sigmaY)
f.xy1 <- function(x, y) {
  (1 / (2*pi*sqrt(sigmaX^2*sigmaY^2*(1-rho^2)))) *
    exp((-1/(2*(1-rho^2))) *
          (((x-muX)^2/(sigmaX^2)) -
             (2*rho*(x-muX)*(y-muY)/(sigmaX*sigmaY)) +
             ((y-muY)^2/(sigmaY^2))))
}
x <- seq(150, 250, length=100)
y <- seq(120, 280, length=100)
z <- outer(x, y, f.xy1)    ## z contains 100x100 matrix for f(x,y)
win.graph(height=5.5, width=7.0)
persp(x, y, z, theta=45, phi=45, col="purple")
win.graph(height=5.5, width=7.0)
contour(x, y, z, col="red")

# Plot bivariate density using mvtnorm package to get the density
library(mvtnorm)
mu <- c(muX, muY)
Sigma <- matrix(c(sigmaX^2, sigmaXY, sigmaXY, sigmaY^2), ncol=2)
x <- seq(150, 250, length=100)
y <- seq(120, 280, length=100)
f.xy2 <- function(x, y) { dmvnorm(cbind(x,y), mean=mu, sigma=Sigma) }
z <- outer(x, y, f.xy2)
win.graph(height=5.5, width=7.0)
persp(x, y, z, theta=45, phi=45, col="green")
win.graph(height=5.5, width=7.0)
contour(x, y, z, col="blue")

# Plot conditional distributions of Total Points (Y) given various levels
# of Over/Under (X) - Levels = 165 to 225 by 15 - along with the
# unconditional distribution of Y
x.cond <- seq(165, 225, by=15)
muY.cond <- muY + (sigmaXY / sigmaX^2) * (x.cond - muX)
sigmaY.cond <- sigmaY * sqrt(1 - rho^2)
cbind(x.cond, muY.cond, sigmaY.cond)
ymax <- 1.1 * dnorm(muY.cond[1], muY.cond[1], sigmaY.cond)  ## peak of cond. density
win.graph(height=5.5, width=7.0)
plot(y, dnorm(y, muY, sigmaY), type="l", ylim = c(0, ymax), lwd=2,
     xlab="Total Points (y)", ylab="f(y)",
     main="Normal Density for Total Points")
for (i in 1:length(x.cond)) {
  lines(y, dnorm(y, muY.cond[i], sigmaY.cond), col=i+1, lwd=2)
}
legend(250, .024,
       c("Unconditional","OU=165","OU=180","OU=195","OU=210","OU=225"),
       lty = rep(1, (1+length(x.cond))), col = 1:(1+length(x.cond)))

# Let W = difference between Total score and Over/Under: W = aX + bY
# for a=-1, b=1
a <- -1; b <- 1
(muW <- a * muX + b * muY)
(varW <- a^2 * sigmaX^2 + b^2 * sigmaY^2 + 2 * a * b * sigmaXY)
(sigmaW <- sqrt(varW))
w <- seq(-50, 50, length=300)
win.graph(height=5.5, width=7.0)
plot(w, dnorm(w, muW, sigmaW), type="l", lwd=2,
     xlab="Total Points-Over/Under (w)", ylab="f(w)",
     main="Normal Density for Total Points - Over/Under")
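## Added sketch: under this bivariate normal model, the probability that the
## Total exceeds the Over/Under line is P(W > 0)
1 - pnorm(0, muW, sigmaW)   ## roughly 0.49, close to an even-money bet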
######## Section 1.3

## Chi-square distribution - Useful for making inference on variances of normals
## f(y) = (1/(Gamma(nu/2)*2^(nu/2))) * y^((nu/2)-1) * exp(-y/2)
##        y > 0, nu > 0    nu = "degrees of freedom"    E{Y}=nu, V{Y}=2*nu
##
## Y1,...,Yn ~ N(mu, sigma^2) => (n-1)S^2/sigma^2 ~ Chi-square(n-1)

# Plots of some Chi-square densities
nu <- c(5, 15, 40)
y <- seq(.0001, 100, length=1000)
win.graph(height=5.5, width=7.0)
plot(y, dchisq(y, nu[1]), type="l", lwd=2, xlab="y", ylab="f(y)",
     main="Chi-square Distributions", ylim=c(0,.20))
lines(y, dchisq(y, nu[2]), lwd=2, lty=2, col="blue")
lines(y, dchisq(y, nu[3]), lwd=2, lty=5, col="green")
legend(60, .18, c("Chi-square(5)", "Chi-square(15)", "Chi-square(40)"),
       lty=c(1,2,5), lwd=rep(2,3), col=c(1,"blue","green"))

## t-distribution - Useful for making inference on means of normals when the
## variance is unknown
## f(t) = [Gamma((nu+1)/2)/(Gamma(nu/2)*sqrt(nu*pi))] * (1+t^2/nu)^(-(nu+1)/2)
##        -Inf < t < Inf, nu > 0, E{T}=0, V{T}=nu/(nu-2) for nu>2
##
## Y_1,...,Y_n ~ N(mu, sigma^2) => T = sqrt(n)*(Ybar-mu)/S ~ t(n-1)
## Y_i1,...,Y_ini ~ N(mu_i, sigma^2)   i=1,2
## with S_p^2 = [(n1-1)*S1^2 + (n2-1)*S2^2] / (n1+n2-2)
## T = [(Ybar1-Ybar2)-(mu1-mu2)] / sqrt(S_p^2*((1/n1)+(1/n2))) ~ t(n1+n2-2)

# Plots of some t densities and the standard normal density
nu <- c(3, 12, 25)
t <- seq(-4, 4, length=1000)
win.graph(height=5.5, width=7.0)
plot(t, dnorm(t,0,1), type="l", lwd=2, xlab="t", ylab="f(t)",
     main="Z and t Distributions", ylim=c(0,.50))
for (i in 1:length(nu)) {
  lines(t, dt(t, nu[i]), lty=(i+1), lwd=2, col=(i+2))
}
legend(2, .4, c("Z=N(0,1)", "t(3)", "t(12)", "t(25)"),
       lty=c(1,2,3,4), lwd=rep(2,4), col=c(1,3,4,5))

## F-distribution - Useful for making inference on ratios of variances of normals
## f(f) = [(Gamma((nu1+nu2)/2)*nu1^(nu1/2)*nu2^(nu2/2))/(Gamma(nu1/2)*Gamma(nu2/2))]*
##        [f^(nu1/2-1)/((nu1*f+nu2)^((nu1+nu2)/2))]    f>0, nu1,nu2>0
## E{F} = nu2/(nu2-2) for nu2>2,
## V{F} = 2*nu2^2*(nu1+nu2-2)/(nu1*(nu2-2)^2*(nu2-4)) for nu2>4
##
## Y_i1,...,Y_ini ~ N(mu_i, sigma_i^2)   i=1,2  =>
## Wi = (ni-1)*S_i^2/sigma_i^2 ~ Chi-square(nui)   i=1,2 with nui=ni-1
## F = (W1/nu1)/(W2/nu2) = (S_1^2/sigma_1^2)/(S_2^2/sigma_2^2) ~ F(nu1,nu2)
## Note that when sigma_1^2=sigma_2^2, S_1^2/S_2^2 ~ F(nu1,nu2)

# Plots of 3 F-distributions
nu1 <- c(3,2,6); nu2 <- c(12,24,30)
f <- seq(0, 6, length=1000)
win.graph(height=5.5, width=7.0)
plot(f, df(f, nu1[1], nu2[1]), type="l", lwd=2, xlab="f", ylab="f(f)",
     main="F Distributions", ylim=c(0,1.0))
lines(f, df(f, nu1[2], nu2[2]), lwd=2, lty=2, col="blue")
lines(f, df(f, nu1[3], nu2[3]), lwd=2, lty=5, col="green")
legend(4, .7, c("F(3,12)", "F(2,24)", "F(6,30)"),
       lty=c(1,2,5), lwd=rep(2,3), col=c(1,"blue","green"))

## Inferences regarding a mean and variance
## Let Y1,...,Yn ~ iid N(mu, sigma^2) with n=25, mu=100, sigma=15 (e.g. IQs)
## Sample mean and variance are computed to obtain (1-alpha)100% CIs for mu, sigma
## T ~ t_nu => P(T >= t_nu(a)) = a,   W ~ X2_nu => P(W >= X2_nu(a)) = a
##
## (1-alpha)100% CI for mu: ybar +/- t_(n-1)(alpha/2)*(s/sqrt(n))
## (1-alpha)100% CI for sigma^2:
##   [(n-1)*s^2/X2_(n-1)(alpha/2) , (n-1)*s^2/X2_(n-1)(1-alpha/2)]
## (1-alpha)100% CI for sigma: +sqrt of endpoints for sigma^2

# Take 100000 random samples of size n=25 from N(mu=100,sigma^2=15^2) and
# observe coverage rates of CIs for mu, sigma
set.seed(12345)
num.sim <- 100000
n <- 25
mu <- 100
sigma <- 15
ybar <- numeric(num.sim)
s <- numeric(num.sim)
for (i in 1:num.sim) {
  iq.samp <- rnorm(n, mu, sigma)
  ybar[i] <- mean(iq.samp)
  s[i] <- sd(iq.samp)
}
alpha <- c(.10, .05, .01)
t.crit <- qt(1-alpha/2, n-1)
X2.lo.crit <- qchisq(alpha/2, n-1)
X2.hi.crit <- qchisq(1-alpha/2, n-1)
mean.cov <- numeric(length(alpha))
sigma.cov <- numeric(length(alpha))
for (i in 1:3) {
  mean.LB <- ybar - t.crit[i] * s / sqrt(n)
  mean.UB <- ybar + t.crit[i] * s / sqrt(n)
  mean.cov[i] <- sum((mean.LB <= mu) & (mean.UB >= mu)) / num.sim
  sigma.LB <- sqrt((n-1) * s^2 / X2.hi.crit[i])
  sigma.UB <- sqrt((n-1) * s^2 / X2.lo.crit[i])
  sigma.cov[i] <- sum((sigma.LB <= sigma) & (sigma.UB >= sigma)) / num.sim
}
cov.out <- cbind(mean.cov, sigma.cov)
colnames(cov.out) <- c("Mean Cover", "SD Cover")
rownames(cov.out) <- c("90% CI", "95% CI", "99% CI")
round(cov.out, 4)

mvsd.out <- cbind(rbind(mean(ybar), var(ybar), sd(ybar)),
                  rbind(mean(s^2), var(s^2), sd(s^2)),
                  rbind(mu, sigma^2/n, sigma/sqrt(n)),
                  rbind(sigma^2, 2*sigma^4/(n-1), sqrt(2*sigma^4/(n-1))))
colnames(mvsd.out) <- c("Ybar", "S2", "Theory(Ybar)", "Theory(S2)")
rownames(mvsd.out) <- c("Mean", "Variance", "Std Dev")
round(mvsd.out, 3)
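## Added check (a minimal sketch): the simulated S^2 values should also track
## the Chi-square result above: (n-1)S^2/sigma^2 ~ Chi-square(n-1)
w.sim <- (n-1) * s^2 / sigma^2
c(mean(w.sim), var(w.sim))   ## theory: E = n-1 = 24, V = 2*(n-1) = 48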
######## Section 1.4

## Bernoulli Distribution
## WNBA Free Throw Shooting - Maya Moore
attempts <- 181; made <- 160
y <- c(rep(1, made), rep(0, attempts-made))
table(y)

# Brute-force
pihat <- sum(y) / length(y)
V.pihat <- pihat * (1-pihat) / length(y)
maya1.out <- cbind(pihat, sqrt(V.pihat),
                   pihat - qnorm(.975)*sqrt(V.pihat),
                   pihat + qnorm(.975)*sqrt(V.pihat))
colnames(maya1.out) <- c("Estimate", "Std. Error", "Lower", "Upper")
round(maya1.out, 3)

# Fit a generalized linear model relating y to the mean with logit link function
# Take regression coefficient b0 and
#   pihat = e^b0 / (1 + e^b0) => b0 = log(pihat / (1-pihat))
#   db0/dpihat = 1 / (pihat * (1-pihat))
#   V{pihat} = V{b0} / (db0/dpihat)^2
maya.mod <- glm(y ~ 1, family=binomial("logit"))
maya.s <- summary(maya.mod)
maya.s
b0 <- maya.s$coef[,1]
V.b0 <- maya.s$coef[,2]^2
(pihat1 <- exp(b0) / (1 + exp(b0)))
db0_dpihat <- 1 / (pihat1 * (1-pihat1))
(V.pihat1 <- V.b0 / db0_dpihat^2)

## Poisson Distribution
## EPL Goals per Game - 2012/13 Season
goals <- 0:10
goals.freq <- c(35,61,72,91,64,32,13,9,1,0,2)
y <- rep(goals, goals.freq)    # 0 repeated 35 times, 1 repeated 61 times, ...
n.games <- sum(goals.freq)
cbind(goals, goals.freq)

# Brute-force
lambdahat <- sum(y) / n.games
V.lambdahat <- lambdahat / n.games
epl1.out <- cbind(lambdahat, sqrt(V.lambdahat),
                  lambdahat - qnorm(.975)*sqrt(V.lambdahat),
                  lambdahat + qnorm(.975)*sqrt(V.lambdahat))
colnames(epl1.out) <- c("Estimate", "Std. Error", "Lower", "Upper")
round(epl1.out, 3)

# Fit a generalized linear model relating y to the mean with log link function
# Take regression coefficient b0 and
#   lambdahat = e^b0 => b0 = log(lambdahat)
#   db0/dlambdahat = 1 / lambdahat
#   V{lambdahat} = V{b0} / (db0/dlambdahat)^2
epl.mod <- glm(y ~ 1, family=poisson("log"))
epl.s <- summary(epl.mod)
epl.s
b0 <- epl.s$coef[,1]
V.b0 <- epl.s$coef[,2]^2
(lambdahat1 <- exp(b0))
db0_dlambdahat <- 1 / lambdahat1
(V.lambdahat1 <- V.b0 / db0_dlambdahat^2)

## Negative Binomial Distribution using glm.nb function (no closed form)
## This estimates b0 = log(mu) and theta = 1/alpha
library(MASS)
epl.mod1 <- glm.nb(y ~ 1)
epl.s1 <- summary(epl.mod1)
epl.s1
b0 <- epl.s1$coef[,1]
V.b0 <- epl.s1$coef[,2]^2
alpha.inv <- epl.s1$theta
V.alpha.inv <- epl.s1$SE.theta^2
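## Added check (a minimal sketch): the NB fit should reproduce the sample
## mean, and its alphahat = 1/theta can be compared against the Poisson fit,
## which forces V{Y} = E{Y}
c(exp(b0), mean(y))   ## fitted mu vs sample mean
1 / alpha.inv         ## alphahat; values near 0 would favor the Poisson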
## Gamma Distribution
## Running speeds for females at Rock and Roll Marathon - 2015
rr.mar <- read.csv(
  "http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv",
  header=T)
attach(rr.mar); names(rr.mar)
f.mph <- mph[Gender == "F"]
rrf.mod1 <- glm(f.mph ~ 1, family=Gamma(link="log"))
rrf.s <- summary(rrf.mod1)
rrf.s
names(rrf.s)

## Intercept represents log(mu) = log(alpha/beta)
## Dispersion parameter is phi = 1/alpha
## => alpha = 1/phi, beta = 1/(mu*phi)
b0 <- rrf.s$coef[1]
muhat <- exp(b0)
phihat <- rrf.s$dispersion
alphahat <- 1 / phihat
betahat <- 1 / (muhat*phihat)
round(cbind(muhat, alphahat, betahat), 3)

## Histogram with fitted Gamma density scaled to counts (bin width = 1/6)
f.xmph <- seq(4, 11, .01)
fg.ymph <- dgamma(f.xmph, 49.38, 8.46)
hist(f.mph, ylim=c(0,120), breaks=seq(4,11,1/6), xlab="MPH")
lines(f.xmph, fg.ymph * length(f.mph) / 6)
rm(list=ls(all=TRUE))

## Beta Distribution
## NASCAR Prize Proportion won by Fords - 1995
nascar <- read.csv("http://users.stat.ufl.edu/~winner/data/nascard.csv")
attach(nascar); names(nascar)
carMake95 <- carMake[year == 1995]
prize95 <- prize[year == 1995]
yearRace95 <- yearRace[year == 1995]
numRace95 <- max(yearRace95)
racePrize95 <- numeric(numRace95)
fordPrize95 <- numeric(numRace95)
for (i1 in 1:numRace95) {
  racePrize95[i1] <- sum(prize95[yearRace95 == i1])
  fordPrize95[i1] <- sum(prize95[(yearRace95 == i1) & (carMake95 == "Ford")])
}
fordProp95 <- fordPrize95 / racePrize95
mean(fordProp95)
sd(fordProp95)

# install.packages("betareg")
library(betareg)
FP.mod1 <- betareg(fordProp95 ~ 1)
FP.s <- summary(FP.mod1)
FP.s
names(FP.s)

## betareg gives estimates for gamma and phi and their standard errors
## Note that phi = e^phi* in course notes
(gammahat <- FP.s$coef$mean[[1]])       ## Obtain gammahat = logit(muhat)
(phihat <- FP.s$coef$precision[[1]])    ## Obtain phihat
(muhat <- exp(gammahat) / (1+exp(gammahat)))
(alphahat <- muhat * phihat)
(betahat <- (1-muhat) * phihat)
(sigma2hat <- (alphahat*betahat) / ((phihat^2)*(phihat+1)))
V.gammahat <- FP.s$coef$mean[[2]]^2     ## Obtain V{gammahat}
dmuhat_dgammahat <- exp(gammahat) / (1 + exp(gammahat))^2
V.muhat <- V.gammahat * dmuhat_dgammahat^2
ford.out <- cbind(muhat, sqrt(V.muhat),
                  muhat - qnorm(.975)*sqrt(V.muhat),
                  muhat + qnorm(.975)*sqrt(V.muhat))
colnames(ford.out) <- c("Estimate", "Std. Error", "Lower", "Upper")
round(ford.out, 4)
rm(list=ls(all=TRUE))

######## Section 1.6

## Bootstrapping the Coefficient of Variation - Strength of Concrete Beams
beams <- read.csv(
  "http://www.stat.ufl.edu/~winner/data/concretebeams_strength.csv")
attach(beams); names(beams)

## Brute-force approach
ratio_mod1 <- shr_stng / ss_mod1
mean(ratio_mod1); sd(ratio_mod1)
(cv_data <- 100*sd(ratio_mod1)/mean(ratio_mod1))
win.graph(height=5.5, width=7.0)
hist(ratio_mod1, breaks=15, main="Ratio of Experimental Strength to Model")
N <- 100000
cv1.boot <- numeric(N)
for (i in 1:N) {
  x <- sample(ratio_mod1, length(ratio_mod1), replace=T)  ## resample n obs
  cv1.boot[i] <- 100*sd(x)/mean(x)
}
win.graph(height=5.5, width=7.0)
hist(cv1.boot, breaks="Scott", main="CV(%) of 100000 Bootstrap Samples")
summary(cv1.boot)
sd(cv1.boot)
quantile(cv1.boot, c(.025,.975))

## Using the boot package
# install.packages("boot")
library(boot)
# Create a function that computes the statistic of interest (i is sample id)
cv2 <- function(ratio_mod1, i) 100*sd(ratio_mod1[i])/mean(ratio_mod1[i])
# run boot(data, function, R=#samples)
cv2.boot <- boot(ratio_mod1, cv2, R=100000)
cv2.boot
100*sd(ratio_mod1)/mean(ratio_mod1)    # original CV
mean(cv2.boot$t)                       # mean of bootstrapped CV's
mean(cv2.boot$t) - cv_data             # bias calculation
sqrt(var(cv2.boot$t))                  # std. error of bootstrapped CV's
# comparison of original CV and lower and upper cut-offs of bootstrap CV's
cv_data - quantile(cv2.boot$t, c(.025, .975))
# function for computing various CI's from the bootstrapped CV's
boot.ci(cv2.boot)
quantile(cv2.boot$t, c(.025, .975))
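## Added sketch: the "basic" bootstrap interval reported by boot.ci() is
## approximately the percentile cut-offs reflected around the original
## estimate: [2*cv_data - q(.975), 2*cv_data - q(.025)]
2 * cv_data - quantile(cv2.boot$t, c(.975, .025))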