vitaminc<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/VITAMINC.txt",header=T,na.strings="*")


#a)
NSh0<-h0[Group==1]
NSh0

NSh2<-h2[Group==1]

Sh0<-h0[Group==2]

Sh2<-h2[Group==2]

par(mfrow=c(2,3))
hist(Sh0,xlim=c(0,2),prob=T)
hist(Sh2,xlim=c(0,2.5),prob=T)
hist(Sh2-Sh0,xlim=c(0,2),prob=T)
hist(NSh0,xlim=c(0,2),ylim=c(0,2),prob=T,breaks=10)
hist(NSh2,xlim=c(0,2.5),prob=T,breaks=10)
hist(NSh2-NSh0,xlim=c(0,2),ylim=c(0,1.5),prob=T,breaks=10)

par(mfrow=c(3,1))
boxplot(Sh0,NSh0)
boxplot(Sh2,NSh2)
boxplot(Sh2-Sh0,NSh2-NSh0)

par(mfrow=c(1,1))
#The non-schizophrenic group appears to be higher in all three comparisons.

#b)
t.test(Sh0,NSh0)
t.test(Sh2,NSh2)
t.test(Sh2-Sh0,NSh2-NSh0)

#c)
wilcox.test(Sh0,NSh0)
wilcox.test(Sh2,NSh2)
wilcox.test(Sh2-Sh0,NSh2-NSh0)

#d)
NST<-UrTot[Group==1]
ST<-UrTot[Group==2]

NSmpk<-URmpk[Group==1]
Smpk<-URmpk[Group==2]

summary(Smpk)
summary(NSmpk)
summary(NST)
summary(ST)


boxplot(Smpk,NSmpk)
boxplot(ST,NST)


par(mfrow=c(2,2))

hist(Smpk)
hist(ST)
hist(NSmpk)
hist(NST)


#The data do not look normally distributed. The boxplots and histograms are skewed.
#It appears that the non-schizophrenic group has a higher excretion rate in both comparisons.

#e)

t.test(Smpk,NSmpk)
t.test(ST,NST)

#The sample sizes are small and the distributions are skewed. The non-parametric methods are more appropriate 
#here even though the t.test may be somewhat robust against the departure from normality.
#Normality should certainly not be assumed.
#f)

wilcox.test(Smpk,NSmpk)
wilcox.test(ST,NST)

#Neither t.test woould have rejected the null hypothesis that the mean values were the same at the 5% level.
#The wilcoxon test rejects both null hypotheses at the 2% level and backs up the graphical observation  
#that the Schizophrenic group has a lower average rate of urinary excretion of vitamin C.


#g)
SP<-Plasma[Group==2&!is.na(Plasma)]
SU<-Urine[Group==2&!is.na(Urine)]
CP<-Plasma[Group==1]
CU<-Urine[Group==1]

par(mfrow=c(2,1))
hist(SP)
hist(CP)

hist(SU,xlim=c(0,300))
hist(CU)

boxplot(SP,CP)
boxplot(SU,CU)

shapiro.test(SP)
shapiro.test(SU)
shapiro.test(CP)
shapiro.test(CU)

#From the graphics and the Shapiro test, it seems safe to use the t.test. i.e. the data do not depart too
# far from normality.Note: This type of use of the Shapiro test for small samples should be made in 
#conjunction with graphical evidence. The power of the test may be low.



#It seems from the graphics that the Plasma concentrations are similar but the Urinary concentrations are 
#significantly different. There is a noticeable difference in the variability of the quantities between the 
#groups.



#h)

t.test(SP,CP)
t.test(SU,CU)



#i)

wilcox.test(SP,CP)
wilcox.test(SU,CU)

#The tests reach the same conclusions. It seems that the ranks come out perfectly for the Plasma 
#concentrations. The schizophrenic group has  significantly lower concentration of  vitamin C in urine.


sum(rank(c(SP,CP))[1:15])
#[1] 232
sum(rank(c(SP,CP))[16:30])
#[1] 233


#I don't know enough biology to know what this all means. Maybe schizophrenics sweat more on the average?


#IRON pg. 428

iron<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/IRON.txt",header=T,na.strings="*")
iron
#In this example we compare the 1.2 millimolar concentrations of the (3+) form U2, with the same concentration of 
#the (2+) form V2.

attach(iron)

op<-par(no.readonly=TRUE)
par(mfcol=c(2,2),pin=c(2,2))

qqnorm(U2)
qqnorm(V2)
hist(U2)
hist(V2)
par(op)

#Both distributions appear to be skewed and may contain outliers.
boxplot(U2,V2)

t.test(U2,V2)



#log transformation

boxplot(log(U2),log(V2))

op<-par(no.readonly=TRUE)
par(mfcol=c(2,2),pin=c(2,2))

qqnorm(log(U2))
qqnorm(log(V2))
hist(log(U2))
hist(log(V2))
par(op)

t.test(log(U2),log(V2))

 #Welch Two Sample t-test

#data:  log(U2) and log(V2) 
#t = -0.9173, df = 33.373, p-value = 0.3656
#alternative hypothesis: true difference in means is not equal to 0 
#95 percent confidence interval:
#-0.6074408  0.2298013 
#sample estimates:
#mean of x mean of y 
#1.901225  2.090045 

# Assuming the distibutions of U2 and V2 are the same, (-.61,.23) is 95 percent C.I. for 
#log(mu_U2) -log(mu_V2)
#therefore:

c(exp(-.61),exp(.23))


#is an approximate 95% C.I. for the ratio (mu_U2/mu_V2)

mean(U2)/mean(V2)

#The nonparametric method (see page 442)

wilcox.test(U2,V2,conf.int=T)

#        Wilcoxon rank sum test

#data:  U2 and V2 
#W = 133, p-value = 0.3717
#alternative hypothesis: true mu is not equal to 0 
#95 percent confidence interval:
# -3.67  1.64 
#sample estimates:
#difference in location 
                -1.115 




#------------------------------------------------------------------------------------------
# MERCURY
mercury<-read.csv("C:/Documents and Settings/Mike/Desktop/Math 152/MERCURY.txt",header=T)
mercury<-read.csv("http://math.cmc.edu/moneill/Math152/Handouts/MERCURY.txt",header=T)
mercury<-read.table("fishmercury.txt",header=T)

#Note: One of the data points may have a misplaced decimal

mercury
attach(mercury)
P<-permanganate
SR<-reduction
Diff<-P-SR
Diff
#[1]  0.07  0.07  0.00 -0.04  0.10 -0.05  0.11  0.35  0.36  0.19 -0.05 -0.01
#[13]  0.02 -0.01 -0.02  0.01  0.08 -0.02  0.03 -0.13  0.02 -0.04  0.01  0.04
#[25] -0.08

srank<-sign(Diff[Diff!=0])*rank(abs(Diff[Diff!=0]))
srank
#[1]  16.0  15.0 -10.5  19.0 -13.5  20.0  23.0  24.0  22.0 -13.5  -2.5   5.0
#[13]  -2.5  -7.0   2.5  17.5  -7.0   9.0 -21.0   7.0 -12.0   2.5  10.5 -17.5


#I don't know why R won't average ALL the tied values, but it should not change the statistic significantly.

sum(srank[srank>0])
#[1] 193
sum(srank[srank<0])
#[1] -107

wilcox.test(SR,P,paired=T,correct=F)

 #       Wilcoxon signed rank test

#data:  SR and P 
#V = 107, p-value = 0.2188
#alternative hypothesis: true mu is not equal to 0 

#Warning messages:
#1: cannot compute exact p-value with ties in: wilcox.test.default(SR, P, paired = T, correct = F) 
#2: cannot compute exact p-value with zeroes in: wilcox.test.default(SR, P, paired = T, correct = F) 


#What about the t-test?
op<-par(no.readonly=TRUE)
par(mfcol=c(2,2),pin=c(2,2))

qqnorm(SR)
qqnorm(P)
hist(SR)
hist(P)
par(op)

shapiro.test(SR)
shapiro.test(P)

#The P data has a heavy right tail but the sample size is large enough that we may still expect the 
#t.test to be useful. (See lecture notes for some general rules of thumb).

t.test(SR,P, paired=T)

# Paired t-test

#data:  SR and P 
#t = -1.7448, df = 24, p-value = 0.09382
#alternative hypothesis: true difference in means is not equal to 0 
#95 percent confidence interval:
# -0.088189837  0.007389837 
#sample estimates:
#mean of the differences 
#                -0.0404 


# We may not conclude that the methods are systematically different from either of the tests. But the heavy tailed 
#distribution of P may be more than just an aberration.

plot(P,Diff)

#------------------------------------------------------------------------------------

#Problem 22 chapter 11
calcium<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/CALCIUM.txt",header=T,na.strings="*")

attach(calcium)

plot(Oxalate[order(Oxalate)],Flame[order(Flame)])
id<-function(x){x}
curve(id,add=T)

op<-par(no.readonly=TRUE)
par(mfcol=c(2,1),pin=c(2,2))

hist(Oxalate)
hist(Flame)

hist(log(Oxalate))
hist(log(Flame))

par(op)

boxplot(Oxalate,Flame)

summary(Oxalate)
summary(Flame)


plot(1:118,Oxalate-Flame)
barplot(Oxalate-Flame)

t.test(Oxalate,Flame,paired=T)
t.test(log(Oxalate),log(Flame),paired=T)
wilcox.test(Oxalate,Flame,conf.int=T,paired=T)

#There is a systematic difference in the two methods with the Oxalate method tending to produce values 
#which are larger on the average by about .035

#--------------------------------------------------------------------------------------------
#Problem 29 chapter 11 and the haiku data from the Amabile study.
#haiku<-read.csv("U:/Math 152/haiku.txt",header=T)
#haiku<-read.csv("C:/Documents and Settings/Mike/Desktop/Math 152/haiku.txt",header=T)
 haiku<-read.csv("http://math.cmc.edu/moneill/Math152/Handouts/haiku.txt",header=T)

attach(haiku)
names(haiku)

score.int<-SCORE[TREATMENT=="INTRINSIC"]

score.ext<-SCORE[TREATMENT=="EXTRINSIC"]

boxplot(SCORE~TREATMENT)

t.test(score.int,score.ext,var.equal=T)

wilcox.test(score.int,score.ext,conf.int=T)

# A permutation test

mean(score.int) - mean(score.ext)

f<-function(x){l<-sample(1:47,47)
 mean(SCORE[l][1:24])-mean(SCORE[l][25:47])}

 d<-sapply(1:10000,f)

hist(d)

d[order(d)][9950]

d[order(d)][50]
#-------------------------------------------------------------------------------------------------
#Problem 35 chapter 11
ratozone<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/RATOZONE.txt",header=T,na.strings="*")

names(ratozone)

attach(ratozone)

Oz<-Oz[!is.na(Oz)]

op<-par(no.readonly=TRUE)
par(mfcol=c(2,1),pin=c(2.5,2.5))
hist(Oz)
hist(C)

par(op)

boxplot(Oz,C)

qqnorm(Oz)

qqnorm(C)

shapiro.test(C[order(C)][2:23])
shapiro.test(Oz)

wilcox.test(Oz,C,conf.int=T)

t.test(Oz,C)

qqplot(C,Oz)
curve(id,add=T)

# There is little doubt of a treatment effect. Ozone exposure appears to cause a decrease in weight gain
#but strangely enough seems to increase weight gain for the subjects with the largest gains.
#The treatment effect certainly does not seem to be additive, but see the discussion in Doksum and Sievers
#(available on my webpage).
#-------------------------------------------------------------------------------------------

#Problem 47  chapter 11
france<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/FRANCE.txt",header=T,na.strings="*")

names(france)
detach(telephon)
france
Control

 seedtarg<-Target[Group==1]
 unseedtarg<-Target[Group==2]

 seedcon<-Control[Group==1]
 unseedcon<-Control[Group==2]


op<-par(no.readonly=TRUE)
par(mfcol=c(2,2),pin=c(2,2))

hist(seedtarg)
hist(unseedtarg)

hist(seedcon)
hist(unseedcon)

par(op)


t.test(seedtarg,unseedtarg)
wilcox.test(seedtarg,unseedtarg)

op<-par(no.readonly=TRUE)
par(mfcol=c(2,1),pin=c(2,2))

hist(seedtarg-seedcon,xlim=c(-15,50))
hist(unseedtarg-unseedcon,xlim=c(-15,50))

par(op)

sd(seedtarg)
sd(unseedtarg)

sd(seedtarg-seedcon)
sd(unseedtarg-unseedcon)

t.test(seedtarg-seedcon,unseedtarg-unseedcon)
wilcox.test(seedtarg-seedcon,unseedtarg-unseedcon,conf.int=T)


#The seeding seems to have had no observable effect. The non-parametric method is more appropriate here 
# because of the skewed distributions of the rainfall. The investigators tried to alleviate the skewness
# by testing the squareroots of the data. Histograms show that this may have been effective.

#remark:
#Whether the tests tell us anything or not, the other possible combinations of two sample tests
#in this problem present some examples of wide variation between the results of the parametric and 
#non-parametric methods. The reason for the large difference is the skewness of the distributions.
# In most cases one should prefer non-parametric tests in the presence of moderately large but skewed datasets.

#For some interesting commentary on cloud seeding see e.g.
#http://rams.atmos.colostate.edu/gkss.html
#http://www.nawcinc.com/wmfaq.html


#----------------------------------------------------------------------------------------------------

#Problem 54 Chapter 11

steel<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/STEEL.txt",header=T,na.strings="*")
attach(steel)
hist(Ox)
mean(Ox)
#Is this data normally distributed?

#histogram and density fit
hist(Ox,prob=T)
curve(dnorm(x,mean=mean(Ox),sd=sd(Ox)),add=T)

#normal probability plot
qqnorm(Ox)

#and by bare hands
length(Ox)

plot(qnorm((1/41)*(1:40)),Ox[order(Ox)])

#Checking the correlation coefficient for this plot
cor(qnorm((1/41)*(1:40)),Ox[order(Ox)])
#From Filliben, this is between the 75th and 90th percentile.

# The similar built in test named for Shapiro
shapiro.test(Ox)

#Hanging rootogram and chigram

hist(Ox)

phat<- pnorm((3 + .5*(1:9) -mean(Ox))/sd(Ox)) - pnorm((3 + .5*(0:8) -mean(Ox))/sd(Ox))
nhat<-length(Ox)*phat

n<-function(j){length(Ox[3 + (j-1)*.5 <Ox & Ox<=3+ j*.5])}

n<-sapply(1:9,n)

barplot(sqrt(n)-sqrt(nhat))
barplot((n-nhat)/sqrt(nhat))

#Pearson chi squared test (even though it may seem wasteful of information...)
n

npool<-c(6,5,5,6,6,8,4)

nhat

nhatpool<-c(sum(nhat[1:2]),nhat[3:7],sum(nhat[8:9]))
 
sum((nhatpool - npool)^2/nhatpool)

1-pchisq(3.780810,df=4)

#We have no reason to doubt that the data is approximately normally distributed but should
#follow a standard procedure of examing data plotted in order it was collected.
# Generally speaking, we should plot data in as many ways as possible in search of patterns which may not have 
#been noticed by the investigators.

#Plot the data in the order it was obtained

plot(1:40,Ox)

#Distinguish between days by connecting points in each day
for(j in 1:5) lines(order(D)[D==j],Ox[D==j])
#There seems to be a downward trend within each day

#Plot all data by order in which  it was obtained within the day

plot(1:9,3:11,"n") # An empty plot
for(i in 1:9) for(j in 1:5) points(i,Ox[D==j][i])

#The evident downward trend may have (probably did) introduce bias and certainly increased the variance of the 
#measurements obtained. Since the measurements were randomized over time, each bar had an equally likely chance 
#of being measured in any of the time intervals. We may conclude that the downward trend is caused by time 
#evolution of laboratory conditions which occurred each day.Without randomization in time we would not know the 


#Problem 40 chapter 10

clouds<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/CLOUDS.txt",header=T)
names(clouds)
rm(S)
rm(C)
attach(clouds)

qqplot(C,S)
id<-function(x){x}
curve(id,add=T)

#same as

plot(C[order(C)],S[order(S)])
id<-function(x){x}
curve(id,add=T)

qqplot(log(C),log(S))
id<-function(x){x}
curve(id,add=T)

boxplot(C,S)
boxplot(log(C),log(S))

qqnorm(C)
qqnorm(log(C))

qqnorm(S)
qqnorm(log(S))

t.test(log(S),log(C))
t.test(log(S),log(C),var.equal=T)


 





Chapter 11 #42

clouds<-read.table("http://math.cmc.edu/moneill/Math152/Handouts/CLOUDS.txt",header=T)
names(clouds)
rm(S)
rm(C)
attach(clouds)

#a)

sum(rank(c(S,C))[1:26])
#[1] 824

(824-13*27)/26^2
#[1] 0.6997041

#b)

sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])
#[1] 764
#[1] 832

f<-function(x){seedranksum<-sum(rank(c(sample(S,replace=T),sample(C,replace=T)))[1:26])

(1/26^2)*(seedranksum -26*27/2)

}

d<-sapply(1:10000,f)

hist(d)

sd(d)
#[1] 0.07417419

#c)

2*.6997041 - d[order(d)[9750]]
#[1] 0.5606508

2*.6997041 - d[order(d)[250]]
#[1] 0.8461538