#Problem 11, chapter 9
op<-par(no.readonly=TRUE)
par(mfcol=c(3,1),pin=c(1.8,.9))
f1<-function(x){1-pnorm((3.29-x)/2) + pnorm((-3.29-x)/2)}
curve(f1(x),xlim=c(-15,15),ylim=c(-.2,1))
f2<-function(x){1-pnorm((3.92-x)/2) + pnorm((-3.92-x)/2)}
curve(f2(x),xlim=c(-15,15),ylim=c(-.2,1))
curve(f1(x),xlim=c(-15,15),ylim=c(-.2,1),ylab="both")
curve(f2(x),xlim=c(-15,15),ylim=c(-.2,1),add=T)
par(op)

par(mfcol=c(3,1),pin=c(1.8,.9))
f3<-function(x){1-pnorm(1.645-x) + pnorm(-1.645-x)}
curve(f3(x),xlim=c(-15,15),ylim=c(-.2,1))
f4<-function(x){1-pnorm(1.96-x) + pnorm(-1.96-x)}
curve(f4(x),xlim=c(-15,15),ylim=c(-.2,1))
curve(f3(x),xlim=c(-15,15),ylim=c(-.2,1),ylab="both")
curve(f4(x),xlim=c(-15,15),ylim=c(-.2,1),add=T)
par(op)

#Chapter 11 Problem 40
clouds<-read.table("http://math.mckenna.edu/moneill/Math152/Handouts/CLOUDS.txt",header=T)
names(clouds)
rm(S)
rm(C)
attach(clouds)
#a)
sum(rank(c(S,C))[1:26])
#[1] 824
(824-13*27)/26^2
#[1] 0.6997041
#b)
sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])
#[1] 764
#[1] 832
f<-function(x){
seedranksum<-sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])
(1/26^2)*(seedranksum-26*27/2)
}
d<-sapply(1:10000,f)
hist(d)
sd(d)
#[1] 0.07417419
#c)
2*.6997041 - d[order(d)[9750]]
#[1] 0.5606508
2*.6997041 - d[order(d)[250]]
#[1] 0.8461538
#--------------------------------------------------------------------------------------
#Examples on Regression
#The basic tool in R is the function "lm". I recommend reading the help page for this function
#(after you are familiar with chapter 14 in your text). The details of the optional arguments to the
#function and the "see also" section on related functions will be useful.
#---------------------------------------
#Historically at least, the first example is:
#Father and son heights
#father.son<-read.table("http://stat-www.berkeley.edu/users/juliab/141C/pearson.dat",sep=" ")[,-1]
father.son<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/fatherson.txt",header=T)
attach(father.son)
plot(V3 ~ V2, data=father.son,bty="l",pch=20)
#Here is the line y=x+1 (dotted)
abline(a=1,b=1,lty=2,lwd=2)
#and here is the regression line (solid)
abline(lm(V3 ~ V2, data=father.son),lty=1,lwd=2)
#Check the help page for "par" for the meaning of the options in the above commands.
#--------------------------------------------------
#Chapter 14 Problem 2
x<-c(.34,1.38,-.65,.68,1.4,-.88,-.3,-1.18,.5,-1.75)
y<-c(.27,1.34,-.53,.35,1.28,-.98,-.72,-.81,.64,-1.59)
plot(y~x)
lm<-lm(y~x)
abline(lm,lty=1,lwd=2)
summary(lm)
#Call:
#lm(formula = y ~ x)
#
#Residuals:
#     Min       1Q   Median       3Q      Max
#-0.41528 -0.11406  0.03667  0.11680  0.29061
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)
#(Intercept) -0.03340    0.07159  -0.467    0.653
#x            0.90441    0.07008  12.905 1.23e-06 ***
#---
#Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
#Residual standard error: 0.2261 on 8 degrees of freedom
#Multiple R-Squared: 0.9542,     Adjusted R-squared: 0.9484
#F-statistic: 166.5 on 1 and 8 DF,  p-value: 1.230e-06

lm<-lm(x~y)
abline(lm,lty=2,lwd=2)
#We can see from the plots that the two lines are not the same.
#The coefficients in the second regression (c and d) are obtained from the formulae for the
#coefficients of the first regression (a and b) by interchanging x_i's and y_i's. This is clearly not
#the same as determining c and d from a and b by solving for x in terms of y.
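#A small aside (a sketch, not from the text), assuming the x, y and the two fits above: the x~y
#coefficients c and d satisfy x = c + d*y, so the same line drawn in the y-versus-x coordinate
#system is y = -c/d + (1/d)*x. One way to overlay it on the current plot:
cd<-coef(lm(x~y))                              #cd[1] is c, cd[2] is d
abline(a=-cd[1]/cd[2],b=1/cd[2],lty=3,lwd=2)   #the x~y line, solved for y in terms of x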
#Here is the computation for the y~x problem from scratch:
#slope
beta1<-(10*sum(x*y) - sum(x)*sum(y))/(10*sum(x^2) - sum(x)^2)
beta1
#or alternatively (see problem 8)
sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2)
#then (problem 8 again), the intercept is
mean(y)-beta1*mean(x)

#Chapter 14 Problem 29
#a) The covariance of X and Y is easily seen to be 1. The variance of X is 1 and the variance of
#Y is 1+beta^2. So the formula follows.
#b)
c<-c(-.9,-.5,0,.5,.9)
beta<-sqrt((1/c)^2 -1)
beta
#We need to be careful about the "Inf" value in the 3rd position.
#Just remember that we want zero correlation in this case.
#For this part, the function below should end with "cor(X,Y)".
f<-function(j){
X<-rnorm(20)
E<-rnorm(20)
#the j==3 case handles c=0 separately (sign(0)=0 would make Y identically zero)
if(j==3) {Y<-E} else {Y<-(X+beta[j]*E)*sign(c[j])}
cor(X,Y)
#plot(X,Y)
}
f(1)
f(2)
f(3)
f(4)
f(5)
#Change "20" to "2000" to see the empirical correlations approach the true values.
#c) In the function "f" of part b) replace "cor" with "plot" and re-run the definition.
#To quiz yourself with 10 randomly chosen plots from the 5 given correlations:
d<-sample(1:5,10,replace=TRUE)
#and now do f(d[1]), f(d[2]),... Write down your answers.
f(d[1])
#etc.
#Now evaluate d to see how you did.  (1,2,3,4,5) <-> (-.9,-.5,0,.5,.9)

#Chapter 14 Problem 30
X<-rnorm(500)
E<-rnorm(500)
beta<-sqrt((1/.8)^2 -1)
Y<-X + beta*E
plot(Y ~ X, bty="l",pch=20)
abline(a=0,b=1,lty=2,lwd=2)
abline(lm(Y ~ X),lty=1,lwd=2)
summary(lm(Y~X))
lm<-lm(Y~X)
par(mfrow=c(2,1))
plot(X,resid(lm))
plot(Y,resid(lm))
par(mfrow=c(1,1))
cor(X,resid(lm))
#[1] 9.891134e-18
cor(Y,resid(lm))
#[1] 0.5532878
#The vector of residuals is orthogonal to the vector X (one of the 2 columns of the "design matrix"),
#so we see the correlation is (practically) zero.
#The vector Y is the orthogonal sum of the fitted values and the residuals, so there is a positive
#correlation. The correlation between fitted values and residuals is again zero
#(since the vector of fitted values lies in the column space of the design matrix).
cor(fitted(lm),resid(lm))
#[1] 9.853195e-18
#-------------------------------------------------------------------
#Flow rate vs. Depth (pg 551 and following)
depthflow<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/depthflow.txt",header=T)
attach(depthflow)
#Plot Flow vs. Depth
plot(Flow~Depth,xlim=c(.2,.8))
#Create an object called "lm" (for "linear model") which contains all the information that can be gleaned
#from solving the simple least squares problem for a linear fit.
#See the help page for "lm" to find out what functions to use to extract all the information.
lm<-lm(Flow~Depth)
#Plot the residuals vs. Depth
plot(resid(lm)~Depth,xlim=c(.2,.8))
#The fit doesn't look good, but the regression of the log of the Flow on the log of the Depth
#gives a better fit.
lm<-lm(log(Flow)~log(Depth))
plot(log(Flow)~log(Depth))
plot(resid(lm)~log(Depth))
#Fitting a quadratic function instead of a line amounts to adding a third column to the design matrix
#(the column of x^2's). Finding the coefficients is still a linear least squares problem.
#There is a syntax point here. The right-hand side of the formula given to "lm" is symbolic: it describes
#the form of the model rather than an arithmetic expression. The quadratic term can fail to be recognized
#unless it is "protected" inside a call to the identity function "I".
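#A brief aside (a sketch, independent of the text): inside an lm formula, "^" belongs to the formula
#language (it denotes crossing of terms), so an unprotected Depth^2 collapses back to Depth rather than
#producing a squared column. Assuming depthflow is still attached, the design matrices make this visible:
model.matrix(lm(Flow~Depth+Depth^2))        #only two columns: the unprotected quadratic term is absorbed
model.matrix(lm(Flow~Depth+I(Depth^2)))     #three columns: (Intercept), Depth, I(Depth^2)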
#Create the quadratic model object
quadm<-lm(Flow~Depth+I(Depth^2))
#Compare with the table in example B on page 543
summary(quadm)
#The residuals now show no sign of systematic misfit
plot(resid(quadm)~Depth)
#And we can graphically compare the predicted values with the observed values as follows:
plot(Depth,Flow)
points(Depth,predict(quadm),pch=19)
#-----------------------------------------------------------------
#Cancer mortality (from chapter 7 Problem 57)
cancer<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/cancer.txt",header=T)
attach(cancer)
#Recall that y is the mortality in the given county with population x
plot(y~x)
lm<-lm(y~x)
plot(resid(lm)~x)
#The plot of residuals vs. population is very bunched up for small values of x.
#To get a better view, spread things out by plotting residuals vs. log(x).
plot(resid(lm)~log(x))
#It is now clear that the variance of the residuals is not constant but increases with x.
#We would like to be able to use our standard model (constant variance of the "noise").
#To that end, we can attempt to transform the data so that the residuals will behave better.
plot(sqrt(y)~sqrt(x))
lm<-lm(sqrt(y)~sqrt(x))
plot(resid(lm)~sqrt(x))
#We see that the regression of sqrt(y) on sqrt(x) has residuals which appear to have an approximately
#constant variance.
#Now go back to the basic linear model (y vs. x).
lm<-lm(y~x)
#Notice that the residuals do not appear to be normally distributed, by checking a normal probability plot:
plot(resid(lm)[order(resid(lm))]~qnorm((1/(length(x)+1))*(1:length(x))))
#or
qqnorm(resid(lm))
#Now try the same thing with the sqrt transformation and observe the improvement in the linearity of the
#normal probability plot.
lm<-lm(sqrt(y)~sqrt(x))
qqnorm(resid(lm))
#-----------------------------------------------------------------
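#A short optional sketch (not from the text): qqline() draws a reference line through the first and third
#quartiles of a normal probability plot, which makes the improvement from the sqrt transformation easier
#to judge side by side.
par(mfrow=c(1,2))
lm<-lm(y~x)
qqnorm(resid(lm),main="raw scale")
qqline(resid(lm))
lm<-lm(sqrt(y)~sqrt(x))
qqnorm(resid(lm),main="sqrt scale")
qqline(resid(lm))
par(mfrow=c(1,1))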