# STA 5106 class web site
# http://www.stat.ufl.edu/~jbooth/
#
# Course text: "Introductory Statistics with R"
# by Peter Dalgaard (2002), Springer
# ISBN: 0-387-95475-9
#
# To install R (for Windows) download "rw1070.exe" from CRAN website
# http://cran.us.r-project.org/
#
# To install emacs (for Windows) download "emacs-21.3-bin-i386.tar.gz"
# or "em213b*.zip" from http://ftp.gnu.org/gnu/windows/emacs/latest
#
# To install ESS (Emacs Speaks Statistics) download
# "ess-5.1.24.zip" from http://software.biostat.washington.edu/ess/
#
# Two ways to start up R
# 1. After installing R, double click on R icon
# 2. After installing R, emacs and ESS, type "M-x R" in emacs
#
# GRADING: Email a text file containing solutions to assigned
# problems from the text by 6/6/03.
#
# CHAPTER 1: Basics
#
# FIRST STEPS
# Overgrown calculator
1 + 1
log(2)
log(2, 10)
log(2, 2)
exp(2)
# Simulation of random variables
rnorm(10)
# Assignments
x <- rnorm(100)
# Basic plot functions
plot(x)
hist(x)
# Some variable names are already assigned; e.g. t,T,F,c,etc.
# Defining data vectors using the construct c(...)
weight <- c(60, 72, 57, 90, 95, 72) # kilograms
weight
height <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91) # meters
bmi <- weight / height^2 # body mass index
bmi
# Calculate mean and sd of weight
xbar <- sum(weight) / length(weight)
xbar
# The variable "sd" shares its name with the built-in function; the call
# sd(weight) below still works because R looks for a function when a
# name is used in call position.
sd <- sqrt(sum((weight - xbar)^2) / (length(weight) - 1))
sd
mean(weight)
sd(weight)
# One sample t-test
t.test(bmi, mu = 22.5) # default is mu=0
# More on plotting
plot(height, weight)
plot(height, weight, col = "red", pch = 2) # pch denotes "plotting character"
hh <- 1.65 + (0:5) * .05 # equally spaced points over range of height
hh
lines(hh, 22.5 * hh^2, col = "blue", lty = 2) # lty denotes "line type"
#
# R LANGUAGE ESSENTIALS
#
# R expressions work on variables or "objects"
# Many things in R are done using "function calls";
# e.g. log(x) or plot(x,y)
#
# In the command "plot(height,weight)", height and weight
# are called actual arguments.
# The argument "pch" is a "formal
# argument" (with default value 1). Formal arguments are part
# of the function definition.
args(var)
#
# The first and second arguments of plot are assumed to be the
# x- and y-variables respectively. This is called "positional matching"
#
# The command
plot(y = weight, x = height, pch = 2, col = "red")
# uses "keyword matching" of arguments. In this case pch=2 is known
# as a "named actual argument"
#
# Character vectors
c("Huey", "Dewey", "Louie")
# Logical vectors
c(TRUE, TRUE, FALSE, TRUE)
bmi
bmi > 25
#
# Missing values
x <- c(0, 1, 2, 3, NA)
x
sum(x)
sum(x, na.rm = TRUE)
is.na(x)
#
# Functions that create vectors
c(1, 2, 3, 4, 5)
1:5
seq(1, 5)
seq(1, 5, 2)
rep(2, 5)
rep(1:2, 3)
rep(1:2, each = 3)
rep(1:2, 1:2)
#
# Indexing
height[1]
weight[2:3]
#
# Matrices and arrays
x <- 1:12
dim(x) <- c(3, 4)
x
matrix(1:12, nrow = 3)
matrix(1:12, nrow = 3, byrow = TRUE)
x <- matrix(1:12, nrow = 3, byrow = TRUE)
rownames(x) <- month.abb[1:3]
x
x[1:2, 3]
t(x)
cbind(A = 1:4, B = 5:8, C = 9:12) # column bind
rbind(A = 1:4, B = 5:8, C = 9:12) # row bind
#
# Factors
pain <- c(0, 3, 2, 2, 1) # value of numerical variable for 5 patients
pain
sum(pain)
fpain <- factor(pain, levels = 0:3)
fpain
sum(fpain) # deliberate error: sum is not meaningful for a factor
levels(fpain) <- c("none", "mild", "medium", "severe")
fpain
as.numeric(fpain)
text.pain <- c("none", "severe", "medium", "medium", "mild")
factor(text.pain) # default is to order levels alphabetically
factor(text.pain, levels = c("none", "mild", "medium", "severe"))
#
# Lists
mylist <- list(ht = height, wt = weight)
mylist
mylist$ht
#
# Data frames
# A list of vectors/factors of the same length
# A data matrix with variables in columns
# Each row corresponds to an experimental unit
d <- data.frame(weight, height)
d
d$weight
#
# Conditional selection
bmi
height
bmi[height > 1.75]
bmi[height == 1.75] # == to avoid confusion with keyword assignments in functions
# Also <=, >=, !=
# Other logical operators
# & "and", | "or", !
# "not"
bmi[weight != 72 & height > 1.75]
weight != 72 & height > 1.75
#
# Indexing of data frames
d
d[5, 2]
d[5, ]
d[height > 1.75, ]
s <- height > 1.75
d[s, ]
#
# subset and transform
d2 <- transform(d, bmi = weight / height^2)
d2
subset(d2, bmi > 20)
library(ISwR) # access library of data sets from text
data(energy) # energy expenditure of lean and obese women
energy
names(energy)
exp.lean <- energy$expend[energy$stature == "lean"]
exp.obese <- energy$expend[energy$stature == "obese"]
e <- split(energy$expend, energy$stature)
#
# Sorting
height
sort(height)
order(height)
o <- order(height)
height[o]
weight[o]
#
# Implicit loops
# list containing mean of each variable in dataframe
lapply(d, mean, na.rm = TRUE)
# simplify to vector or matrix
sapply(d, mean)
# tabulate the value of a function for subgroups defined by a factor
tapply(energy$expend, energy$stature, median)
# apply function to row or columns of a matrix
m <- matrix(runif(20), 5)
apply(m, 2, min)
#
# THE GRAPHICS SUBSYSTEM
#
# Plot layout
x <- rnorm(50, mean = 10, sd = 2)
y <- 1.5 * x + rnorm(50, 0, sd = 1)
plot(x, y, main = "Main title", sub = "subtitle", xlab = "x-label", ylab = "y-label")
lm.fit <- lm(y ~ x)
lm.fit
summary(lm.fit)
names(lm.fit)
# coef() extracts the fitted intercept and slope without relying on
# partial matching of the list element name.
abline(coef(lm.fit), col = "blue")
# The labels state the coordinates at which they are actually drawn.
text(6, 20, "centered text at position (6,20)")
text(6, 19, "left-justified text at position (6,19)", adj = 0)
# Label each margin line with its own line number: the label vector and
# the line vector must have the same length so they pair up one-to-one.
for (s in 1:4) mtext(-1:4, side = s, at = 11, line = -1:4)
mtext(paste("side", 1:4), side = 1:4, line = -1, font = 2) # boldface font
help(mtext)
#
# Building a plot from pieces
plot(x, y, type = "n", xlab = "", ylab = "", axes = FALSE) # type "n": plot nothing yet
points(x, y, col = "blue", pch = 3)
axis(1)
axis(2, at = c(7, 10, 15, 22), labels = c("A", "B", "C", "D"))
box()
title(main = "Main title", sub = "subtitle", xlab = "x-label", ylab = "y-label")
#
# Use par() for fine control of plots
#
# Combining plots
x <- rgamma(100, shape = 3, scale = .5)
hist(x)
hist(x, freq = FALSE) # area equals 1
curve(dgamma(x, shape = 3, scale = .5), add = TRUE)
#
# Widen the y-axis so both the histogram bars and the density curve fit
h <- hist(x, freq = FALSE)
ylim <- range(0, h$density, 1.05 * max(dgamma(x, shape = 3, scale = .5)))
hist(x, freq = FALSE, ylim = ylim)
curve(dgamma(x, shape = 3, scale = .5), add = TRUE)
#
# Controlling multiple graphics windows using dev.***() functions
#
x11()
plot(1:10)
x11()
plot(rnorm(10))
dev.set(dev.prev())
abline(0, 1) # through the 1:10 points
dev.set(dev.next())
abline(h = 0, col = "gray") # for the residual plot
dev.set(dev.prev())
dev.off()
dev.off() #- close the two X devices
#
# R PROGRAMMING
#
# A function to evaluate sqrt(x) using Newton's method
#
# Scalar version: x must be a single non-negative number, because both
# "if" and "while" need a length-one condition.
root <- function(x) {
  if (x < 0) stop("argument is negative")
  y <- x / 2
  while (abs(y * y - x) > 1e-10) y <- (y + x / y) / 2
  y
}
root(-2) # deliberate error: "argument is negative"
root(2)
# Vectorised version: iterates until every element has converged.
# The guard uses any() so that a vector argument gives a length-one
# condition; "if" signals an error for longer conditions in current R.
root <- function(x) {
  if (any(x < 0)) stop("argument is negative")
  y <- x / 2
  repeat {
    y <- (y + x / y) / 2
    if (all(abs(y * y - x) < 1e-10)) break # all NOT any
  } # compound expressions inside {}
  y
}
root(2)
root(2:4)
source("root.R") # read in R code from a file
root(1:5)
#
# "for" loops
par(mfrow = c(2, 2))
for (i in 1:4) hist(rnorm(100))
#
# Classes and generic functions
#
# Objects in R have a "class" attribute
h <- t.test(bmi, mu = 22.5) # hypothesis
class(h) # h is a list
names(h)
h$p.value
# The generic function "print" recognizes the class attribute of
# t.test and passes it to the specialized function "print.htest"
print(h)
h
# If the class attribute is "null", "print.default" is used
class(bmi)
print(bmi)
bmi
#
# WORKSPACE MANAGEMENT
#
# All created variables are stored in a common workspace
ls() # list objects
rm(height, weight, bmi)
ls()
ls # prints ls function code
# rm(list=ls()) to clear entire workspace
# save.image() to save workspace to .RData in working directory
# The .RData file contains the created objects, not the output
# save.image("filename.Rdata") to save to filename
# When you exit R you are asked if you want to save the workspace
# The file .RData is automatically loaded when you start R
# You can load other saved workspaces using the load() function
#
# attach() and detach()
data(thuesen)
plot(thuesen$blood.glucose, thuesen$short.velocity)
blood.glucose # not on the search path yet, so this lookup fails
attach(thuesen)
blood.glucose
search() # search path
detach(thuesen)
search()
#
# Masking a built-in: this one-argument "c" hides the usual c()
c <- function(x) {
  sqrt(x)
}
c(2)
c(1, 2) # deliberate error: the masking function takes only one argument
# Remove any user-defined "c" from the workspace so the built-in c() is
# found again
rm(c)
c(1, 2)
#
# DATA ENTRY
#
# Reading in from a text file
#
# The "read.table" command reads data in ASCII format
# into a data frame. It expects the data in column format
# possibly with variable (column) names and row names.
shuttle.df <- read.table("shuttle-data", header = TRUE)
shuttle.df
class(shuttle.df$flight)
# "read.csv" assumes fields separated by a comma
# "read.csv2" semi-colon
# "read.delim" tab
#
# The data editor
edit(shuttle.df)
#
save.image("chapter1.Rdata")
q()
#
# Interfacing to other programs
# The "foreign" package reads files from SPSS, Stata, Minitab,...
#
# DO PROBLEMS 1.1-1.8