#Problem 11, chapter 9
op<-par(no.readonly=TRUE)
par(mfcol=c(3,1),pin=c(1.8,.9))
f1<-function(x){1-pnorm((3.29-x)/2) + pnorm((-3.29-x)/2)}
curve(f1(x),xlim=c(-15,15),ylim=c(-.2,1))
f2<-function(x){1-pnorm((3.92-x)/2) + pnorm((-3.92-x)/2)}
curve(f2(x),xlim=c(-15,15),ylim=c(-.2,1))
curve(f1(x),xlim=c(-15,15),ylim=c(-.2,1),ylab="both")
curve(f2(x),xlim=c(-15,15),ylim=c(-.2,1),add=T)
par(op)

par(mfcol=c(3,1),pin=c(1.8,.9))
f3<-function(x){1-pnorm(1.645-x) + pnorm(-1.645-x)}
curve(f3(x),xlim=c(-15,15),ylim=c(-.2,1))
f4<-function(x){1-pnorm(1.96-x) + pnorm(-1.96-x)}
curve(f4(x),xlim=c(-15,15),ylim=c(-.2,1))
curve(f3(x),xlim=c(-15,15),ylim=c(-.2,1),ylab="both")
curve(f4(x),xlim=c(-15,15),ylim=c(-.2,1),add=T)
par(op)

#Chapter 11 Problem 40
clouds<-read.table("http://math.mckenna.edu/moneill/Math152/Handouts/CLOUDS.txt",header=T)
names(clouds)
rm(S)
rm(C)
attach(clouds)
#a)
sum(rank(c(S,C))[1:26])
#[1] 824
(824-13*27)/26^2
#[1] 0.6997041
#b)
sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])
#[1] 764
#[1] 832
f<-function(x){
seedranksum<-sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])
(1/26^2)*(seedranksum-26*27/2)
}
d<-sapply(1:10000,f)
hist(d)
sd(d)
#[1] 0.07417419
#c)
2*.6997041 - d[order(d)[9750]]
#[1] 0.5606508
2*.6997041 - d[order(d)[250]]
#[1] 0.8461538
#--------------------------------------------------------------------------------------
#Examples on Regression
#The basic tool in R is the function "lm". I recommend reading the help page for this function
#(after you are familiar with chapter 14 in your text). The details of the optional arguments to the
#function and the "see also" section on related functions will be useful.
#---------------------------------------
#Historically at least, the first example is:
#Father and son heights
#father.son<-read.table("http://stat-www.berkeley.edu/users/juliab/141C/pearson.dat",sep=" ")[,-1]
father.son<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/fatherson.txt",header=T)
attach(father.son)
plot(V3 ~ V2, data=father.son,bty="l",pch=20)
#Here is the line y=x+1 (dotted)
abline(a=1,b=1,lty=2,lwd=2)
#and here is the regression line (solid)
abline(lm(V3 ~ V2, data=father.son),lty=1,lwd=2)
#Check the help page for "par" for the meaning of the options in the above commands.
#--------------------------------------------------
#Chapter 14 Problem 2
x<-c(.34,1.38,-.65,.68,1.4,-.88,-.3,-1.18,.5,-1.75)
y<-c(.27,1.34,-.53,.35,1.28,-.98,-.72,-.81,.64,-1.59)
plot(y~x)
lm<-lm(y~x)
abline(lm,lty=1,lwd=2)
summary(lm)
#Call:
#lm(formula = y ~ x)
#
#Residuals:
#     Min       1Q   Median       3Q      Max
#-0.41528 -0.11406  0.03667  0.11680  0.29061
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)
#(Intercept) -0.03340    0.07159  -0.467    0.653
#x            0.90441    0.07008  12.905 1.23e-06 ***
#---
#Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
#Residual standard error: 0.2261 on 8 degrees of freedom
#Multiple R-Squared: 0.9542,     Adjusted R-squared: 0.9484
#F-statistic: 166.5 on 1 and 8 DF,  p-value: 1.230e-06

lm<-lm(x~y)
abline(lm,lty=2,lwd=2)
#We can see from the plots that the two lines are not the same.
#The coefficients in the second regression (c and d) are obtained from the formulae for the
#coefficients of the first regression (a and b) by interchanging x_i's and y_i's. This is clearly not
#the same as determining c and d from a and b by solving for x in terms of y.
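#A small aside (a sketch, not from the text), assuming the x, y and the two fits above: the x~y
#coefficients c and d satisfy x = c + d*y, so the same line drawn in the y-versus-x coordinate
#system is y = -c/d + (1/d)*x. One way to overlay it on the current plot:
cd<-coef(lm(x~y))                              #cd[1] is c, cd[2] is d
abline(a=-cd[1]/cd[2],b=1/cd[2],lty=3,lwd=2)   #the x~y line, solved for y in terms of x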
#Here is the computation for the y~x problem from scratch:
#slope
beta1<-(10*sum(x*y) - sum(x)*sum(y))/(10*sum(x^2) - sum(x)^2)
beta1
#or alternatively (see problem 8)
sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2)
#then (problem 8 again), the intercept is
mean(y)-beta1*mean(x)

#Chapter 14 Problem 29
#a) The covariance of X and Y is easily seen to be 1. The variance of X is 1 and the variance of
#Y is 1+beta^2. So the formula follows.
#b)
c<-c(-.9,-.5,0,.5,.9)
beta<-sqrt((1/c)^2 -1)
beta
#We need to be careful about the "Inf" value in the 3rd position.
#Just remember that we want zero correlation in this case.
#For this part, the function below should end with "cor(X,Y)".
f<-function(j){
X<-rnorm(20)
E<-rnorm(20)
#the j==3 case handles c=0 separately (sign(0)=0 would make Y identically zero)
if(j==3) {Y<-E} else {Y<-(X+beta[j]*E)*sign(c[j])}
cor(X,Y)
#plot(X,Y)
}
f(1)
f(2)
f(3)
f(4)
f(5)
#Change "20" to "2000" to see the empirical correlations approach the true values.
#c) In the function "f" of part b) replace "cor" with "plot" and re-run the definition.
#To quiz yourself with 10 randomly chosen plots from the 5 given correlations:
d<-sample(1:5,10,replace=TRUE)
#and now do f(d[1]), f(d[2]),... Write down your answers.
f(d[1])
#etc.
#Now evaluate d to see how you did.  (1,2,3,4,5) <-> (-.9,-.5,0,.5,.9)

#Chapter 14 Problem 30
X<-rnorm(500)
E<-rnorm(500)
beta<-sqrt((1/.8)^2 -1)
Y<-X + beta*E
plot(Y ~ X, bty="l",pch=20)
abline(a=0,b=1,lty=2,lwd=2)
abline(lm(Y ~ X),lty=1,lwd=2)
summary(lm(Y~X))
lm<-lm(Y~X)
par(mfrow=c(2,1))
plot(X,resid(lm))
plot(Y,resid(lm))
par(mfrow=c(1,1))
cor(X,resid(lm))
#[1] 9.891134e-18
cor(Y,resid(lm))
#[1] 0.5532878
#The vector of residuals is orthogonal to the vector X (one of the 2 columns of the "design matrix"),
#so we see the correlation is (practically) zero.
#The vector Y is the orthogonal sum of the fitted values and the residuals, so there is a positive
#correlation. The correlation between fitted values and residuals is again zero
#(since the vector of fitted values lies in the column space of the design matrix).
cor(fitted(lm),resid(lm))
#[1] 9.853195e-18
#-------------------------------------------------------------------
#Flow rate vs. Depth (pg 551 and following)
depthflow<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/depthflow.txt",header=T)
attach(depthflow)
#Plot Flow vs. Depth
plot(Flow~Depth,xlim=c(.2,.8))
#Create an object called "lm" (for "linear model") which contains all the information that can be gleaned
#from solving the simple least squares problem for a linear fit.
#See the help page for "lm" to find out what functions to use to extract all the information.
lm<-lm(Flow~Depth)
#Plot the residuals vs. Depth
plot(resid(lm)~Depth,xlim=c(.2,.8))
#The fit doesn't look good, but the regression of the log of the Flow on the log of the Depth
#gives a better fit.
lm<-lm(log(Flow)~log(Depth))
plot(log(Flow)~log(Depth))
plot(resid(lm)~log(Depth))
#Fitting a quadratic function instead of a line amounts to adding a third column to the design matrix
#(the column of x^2's). Finding the coefficients is still a linear least squares problem.
#There is a syntax point here. The right-hand side of the formula given to "lm" is symbolic: it describes
#the form of the model rather than an arithmetic expression. The quadratic term can fail to be recognized
#unless it is "protected" inside a call to the identity function "I".
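#A brief aside (a sketch, independent of the text): inside an lm formula, "^" belongs to the formula
#language (it denotes crossing of terms), so an unprotected Depth^2 collapses back to Depth rather than
#producing a squared column. Assuming depthflow is still attached, the design matrices make this visible:
model.matrix(lm(Flow~Depth+Depth^2))        #only two columns: the unprotected quadratic term is absorbed
model.matrix(lm(Flow~Depth+I(Depth^2)))     #three columns: (Intercept), Depth, I(Depth^2)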
#Create the quadratic model object
quadm<-lm(Flow~Depth+I(Depth^2))
#Compare with the table in example B on page 543
summary(quadm)
#The residuals now show no sign of systematic misfit
plot(resid(quadm)~Depth)
#And we can graphically compare the predicted values with the observed values as follows:
plot(Depth,Flow)
points(Depth,predict(quadm),pch=19)
#-----------------------------------------------------------------
#Cancer mortality (from chapter 7 Problem 57)
cancer<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/cancer.txt",header=T)
attach(cancer)
#Recall that y is the mortality in the given county with population x
plot(y~x)
lm<-lm(y~x)
plot(resid(lm)~x)
#The plot of residuals vs. population is very bunched up for small values of x.
#To get a better view, spread things out by plotting residuals vs. log(x).
plot(resid(lm)~log(x))
#It is now clear that the variance of the residuals is not constant but increases with x.
#We would like to be able to use our standard model (constant variance of the "noise").
#To that end, we can attempt to transform the data so that the residuals will behave better.
plot(sqrt(y)~sqrt(x))
lm<-lm(sqrt(y)~sqrt(x))
plot(resid(lm)~sqrt(x))
#We see that the regression of sqrt(y) on sqrt(x) has residuals which appear to have an approximately
#constant variance.
#Now go back to the basic linear model (y vs. x).
lm<-lm(y~x)
#Notice that the residuals do not appear to be normally distributed, by checking a normal probability plot:
plot(resid(lm)[order(resid(lm))]~qnorm((1/(length(x)+1))*(1:length(x))))
#or
qqnorm(resid(lm))
#Now try the same thing with the sqrt transformation and observe the improvement in the linearity of the
#normal probability plot.
lm<-lm(sqrt(y)~sqrt(x))
qqnorm(resid(lm))
#-----------------------------------------------------------------
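#A short optional sketch (not from the text): qqline() draws a reference line through the first and third
#quartiles of a normal probability plot, which makes the improvement from the sqrt transformation easier
#to judge side by side.
par(mfrow=c(1,2))
lm<-lm(y~x)
qqnorm(resid(lm),main="raw scale")
qqline(resid(lm))
lm<-lm(sqrt(y)~sqrt(x))
qqnorm(resid(lm),main="sqrt scale")
qqline(resid(lm))
par(mfrow=c(1,1))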