pdf("E:\\Rmisc\\Rmisc\\graphs\\sta6207_hw1.pdf")
set.seed(97531)

############### Part 1
n.sim <- c(100,1000,10000,100000)    #### There will be 4 "sample sizes"
mu <- 100; sigma <- 20      #### Assign the mean and standard deviation

#### Normal(mu,sigma)

par(mfrow=c(2,2))         #### Set plots so that there are 2 "rows", 2 "cols"

#### Loop through the 4 "sample sizes"
for (i in 1:length(n.sim)) {
x <- rnorm(n.sim[i],mu,sigma)     ### Draw a sample of n.sim[i] from N(mu,sigma)
print(cbind(n.sim[i],mean(x),sd(x)))     ### Print Sample size, sample mean, sample SD
print(quantile(x,c(.025,0.25,0.50,0.75,0.975)))  ### print sample quantiles

### put limits on y-axis, so top of density and histogram don't get "cut-off"
### dnordnorm(mu,mu,sigma) is the height of density at its peak (mode)
yxlim <- 1.5*n.sim[i]*dnorm(mu,mu,sigma)    

# Histogram of Normal Sample with Normal Super-Imposed 
# Frequency histogram (why we multiply density by sample size and binsize (1))
hist(x,breaks=seq(0,200,1),xlab="x",ylim=c(0,yxlim))
lines(40:160,1*n.sim[i]*dnorm(40:160,mu,sigma),lwd=4)
}  ### End of Loop

print(cbind(mu,sigma))       ### Print theortical mean and SD
qnorm(c(.025,.25,.5,.75,.975),mu,sigma)   ### Print theoretical quantiles

##############################################
par(mfrow=c(1,1))
### Part 2
set.seed(12345)

### Assign sample sizes, means, SDs, and number of simulations
n1 <- 25;  mu1 <- 60; sigma1 <- 10
n2 <- 15;  mu2 <- 50; sigma2 <- 8
n.sim <- 10000

### Assign "blank vectors" to hold results for sample means and SDs
y1.mean <- numeric(n.sim); y1.sd <- numeric(n.sim) 
y2.mean <- numeric(n.sim); y2.sd <- numeric(n.sim)

### Begin looping through samples, saving mean and SD for each group
for (i in 1:n.sim) {
y1 <- rnorm(n1,mu1,sigma1)
y2 <- rnorm(n2,mu2,sigma2)
y1.mean[i] <- mean(y1);   y1.sd[i] <- sd(y1)
y2.mean[i] <- mean(y2);   y2.sd[i] <- sd(y2)
}   ### End Loop

### Compute scaled variance for each group
X1.2 <- (n1-1)*(y1.sd^2)/(sigma1^2)
X2.2 <- (n2-1)*(y2.sd^2)/(sigma2^2) 

### Compute t-value for each group
t1 <- (y1.mean - mu1)/(y1.sd/sqrt(n1))
t2 <- (y2.mean - mu2)/(y2.sd/sqrt(n2))

### Compute F-value for each group
F <- ((y1.sd/sigma1)^2)/((y2.sd/sigma2)^2)

### Print sample means and SDs for scaled variance, t, F
### See notes for theoretical values (based on the relevant degrees of freedom) 
mean(X1.2); sd(X1.2)
mean(t1); sd(t1)
mean(F); sd(F)

### Print Sample quantiles
quantile(X1.2,c(.025,.25,.5,.75,.975))
quantile(t1,c(.025,.25,.5,.75,.975))
quantile(F,c(.025,.25,.5,.75,.975))

### Print theoretical quantiles
qchisq(c(.025,.25,.5,.75,.975),n1-1)
qt(c(.025,.25,.5,.75,.975),n1-1)
qf(c(.025,.25,.5,.75,.975),n1-1,n2-1)

### Histogram of scaled variance 
### Using X1.2[X1.2 <= 100] leaves out very large values that "stretch plot out"
hist(X1.2[X1.2 <= 100],breaks=0:100, 
main=expression(paste("Sampling Distribution of ",X[1]^2)))
lines(0:100,1*10000*dchisq(0:100,n1-1))

### Histogram of t (Only those between -4, 4)
### Multiplies t-density by binsize*n.sin = 0.05*10000
hist(t1[abs(t1) <= 4],breaks=seq(-4,4,.05), 
main=expression(paste("Sampling Distribution of ",t[1])))
lines(seq(-4,4,0.05),0.05*n.sim*dt(seq(-4,4,0.05),n1-1))

### Histogram of F (Only those < 6)
### Multiplies F-density by binsize*n.sinm = 0.05*10000
hist(F[F<=6],breaks=seq(0,6,0.05),main="Sampling Distribution of F")
lines(seq(0,6,0.05),0.05*n.sim*df(seq(0,6,0.05),n1-1,n2-1))


############################################################################

#### Part 3a
### Read file from website 
nhl_ht_wt <- read.csv("http://www.stat.ufl.edu/~winner/data/nhl_ht_wt.csv",
                       header=T)
attach(nhl_ht_wt); names(nhl_ht_wt)


bmi <- 703*Weight/(Height^2)    ### Create bmi from Height and Weight
(N.bmi <- length(bmi))     ### Population size
(mu.bmi <- mean(bmi))      ### Population mean
(sigma.bmi <- sd(bmi)*sqrt((N.bmi-1)/N.bmi))     ### Population SD (Uses N as denominator, not N-1) 

ybar <- numeric(10000); stddev <- numeric(10000)   # Create vectors to save sample means, SDs

for (i in 1:10000) {
y <- sample(bmi, 25)    ### Sample n=25 bmi w/out replacement
ybar[i] <- mean(y)
stddev[i] <- sd(y)
}
X2 <- (25-1)*(stddev/sigma.bmi)^2

mean(ybar); sd(ybar)
mean(X2); sd(X2)

quantile(ybar,c(.025,.25,.5,.75,.975))
quantile(X2,c(.025,.25,.5,.75,.975))
qnorm(c(.025,.25,.5,.75,.975),mu.bmi,sigma.bmi/sqrt(25))
qchisq(c(.025,.25,.5,.75,.975),25-1)

hist(ybar[(ybar >= 24.5) & (ybar <= 28.5)],breaks=seq(24.5,28.5,.01),
xlab="Mean")
lines(seq(24.5,28.5,.01),
0.01*10000*dnorm(seq(24.5,28.5,.01),mu.bmi,sigma.bmi/sqrt(25)),lwd=4)

yx2lim <- 1.1*10000*max(dchisq(0:100,25-1))
hist(X2[X2 <= 100],breaks=0:100, ylim=c(0,yx2lim),
main=expression(paste("Sampling Distribution of ",X^2," - BMI Data")))
lines(0:100,1*10000*dchisq(0:100,25-1))



#### Part 3b
rr.mar <- read.csv("http://www.stat.ufl.edu/~winner/data/rocknroll_marathon_mf2015a.csv",
                     header=T)
attach(rr.mar); names(rr.mar)

f.mph <- mph[Gender=="F"]      ### Subsets Females from population
m.mph <- mph[Gender=="M"]      ### Subsets Males from population

(f.N <- length(f.mph))
(f.mu <- mean(f.mph))
(f.sigma <- sd(f.mph)*sqrt((f.N-1)/f.N))
(m.N <- length(m.mph)) 
(m.mu <- mean(m.mph))
(m.sigma <- sd(m.mph)*sqrt((m.N-1)/m.N)) 

hist(f.mph,ylim=c(0,120),breaks=seq(4,11,1/6),main="Females/Normal")
lines(seq(4,11,1/6),dnorm(seq(4,11,1/6),f.mu,f.sigma)*length(f.mph)/6)

hist(m.mph,ylim=c(0,120),breaks=seq(4,11,1/6),main="Males/Normal")
lines(seq(4,11,1/6),dnorm(seq(4,11,1/6),m.mu,m.sigma)*length(m.mph)/6)

## Begin sampling
set.seed(45678)

ybar.f <- numeric(10000); sd.f <- numeric(10000)
ybar.m <- numeric(10000); sd.m <- numeric(10000)

for (i in 1:10000) {
y.f <- sample(f.mph, 20)
y.m <- sample(m.mph, 20)
ybar.f[i] <- mean(y.f); sd.f[i] <- sd(y.f)
ybar.m[i] <- mean(y.m); sd.m[i] <- sd(y.m)
}

X2.f <- (20-1)*(sd.f**2)/(f.sigma**2)
F.fm <- ((sd.f**2)/(f.sigma**2))/((sd.m**2)/(m.sigma**2))

mean(ybar.f); sd(ybar.f)
quantile(ybar.f,c(.025,.25,.5,.75,.975))
qnorm(c(.025,.25,.5,.75,.975),f.mu,f.sigma/sqrt(20))

mean(X2.f); sd(X2.f)
quantile(X2.f,c(.025,.25,.5,.75,.975))
qchisq(c(.025,.25,.5,.75,.975),20-1)

mean(F.fm); sd(F.fm)
quantile(F.fm,c(.025,.25,.5,.75,.975))
qf(c(.025,.25,.5,.75,.975),20-1,20-1)

dev.off()