# pdf("E:\\blue_drive\\Rmisc\\graphs\\yanks1927away_rand.pdf") yanksaway <- read.csv("http://www.stat.ufl.edu/~winner/sta6207/yanks1927away.csv",header=T) attach(yanksaway); names(yanksaway) set.seed(1234) # Set random number generator seed to reproduce samples (t <- max(opp_id)) # t = number of opponents ("treatments") (N <- length(runs)) # N = total number of games (ybar_all <- mean(runs)) # ybar_all = overall sample mean number of runs per game ### Initialize number of games played against each opponent at 0 ### Initialize mean number of runs per game against each opponent at 0 ### Initialize t x 2 matrix to save range of games played against each at 0 ### Initialize the sum of squares between opponents at 0 n <- rep(0,t) ybar <- rep(0,t) teamrange <- matrix(rep(0,2*t),ncol=2) sumsq_obs <- 0 ### Determine which rows of data frame are associated with each opponent ### n[i] is the number of games played against team i ### For opponent 1, the range is 1:n[1] ### ==> teamrange[1,1]=1 and teamrange[1,2]=n[1] ### For opponent 2, the range is (n[1]+1):(n[1]+n[2]) ### ==> teamrange[2,1]=n[1]+1 and teamrange[2,2]=n[1]+n[2] ... ### ybar[i] is the mean of the variable "runs" in the rows for team i ### sumsq_obs increments SSBetween = sum(n[i]*(ybar[i]-ybar_all)^2) for (i in 1:t) { n[i] <- length(runs[opp_id==i]) # Count numbers of obs or runs when opp_id=1 if (i==1) { teamrange[i,1] <- 1 teamrange[i,2] <- n[i] } else { teamrange[i,1] <- teamrange[(i-1),2]+1 teamrange[i,2] <- teamrange[(i-1),2]+n[i] } ybar[i] <- mean(runs[teamrange[i,1]:teamrange[i,2]]) sumsq_obs <- sumsq_obs + n[i]*(ybar[i]-ybar_all)^2 } teamrange ybar sumsq_obs ### Begin re-sampling to perform Permutation Test ### Choose number of samples to resample = 10000-1=9999 to augment observed sample num_samp <- 10^5-1 ### Initialize vector of sum of sqares for each sample at 0 sumsq_rand <- rep(0,num_samp) ### For each sample, obtain a random permutation of the integers 1:N ### randgrp contains the random permutation of N integers ### Compute ybar[i] as the mean of the games within the range for opponent i ### Compute the sum of squares for each sample and save result to vector for (i1 in 1:num_samp) { randgrp <- sample(N,size=N,replace=FALSE) if (i1 == 1) print(cbind(1:N,randgrp)) # Print the first permutation sumsq <- 0 for (i2 in 1:t) { ybar[i2] <- mean(runs[randgrp[teamrange[i2,1]:teamrange[i2,2]]]) sumsq <- sumsq + n[i2]*(ybar[i2]-ybar_all)^2 } sumsq_rand[i1] <- sumsq } summary(sumsq_rand) # Give the summary of the sum of squares ### Compute the p-value for the observed sum of squares ### Count the number of permutation samples with sum of squares >= observed ### Add 1 to include the observed sample in numerator and denominator (p_rand <- (sum(sumsq_rand >= sumsq_obs)+1)/(num_samp+1)) hist(sumsq_rand,xlab="Opponent Sum of Squares", main="Randomization Distribution for Runs Scored by Opponent") abline(v=sumsq_obs) # dev.off()