library("MASS")
data(cats)
str(cats)
## 'data.frame':    144 obs. of  3 variables:
##  $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Bwt: num  2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
##  $ Hwt: num  7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
summary(cats)
##  Sex         Bwt             Hwt       
##  F:47   Min.   :2.000   Min.   : 6.30  
##  M:97   1st Qu.:2.300   1st Qu.: 8.95  
##         Median :2.700   Median :10.10  
##         Mean   :2.724   Mean   :10.63  
##         3rd Qu.:3.025   3rd Qu.:12.12  
##         Max.   :3.900   Max.   :20.50
with(cats,plot(Bwt,Hwt))
title(main = "Heart weight (g) vs body weight (kg) of domestic cats")

with(cats,plot(Hwt~Bwt))

#The with() function evaluates an expression using a dataset: with(data, expr)
#data: a list or a data frame
#expr: one or more expressions to evaluate using the contents of data
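#For example, these two calls are equivalent (summary() above reports the value as 2.724):
with(cats, mean(Bwt))
mean(cats$Bwt)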

plot(Hwt~Bwt,data = cats)
#y ~ x means we want to see how the y-values vary with the values of x;
#x ~ y reverses the roles of x and y. For those with a little math background,
#y ~ x says: plot y as a function of x, i.e. y = f(x).
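#For instance, reversing the roles plots body weight as a function of heart weight:
plot(Bwt ~ Hwt, data = cats)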
with(cats,cor(Bwt,Hwt))
## [1] 0.8041274
with(cats,cor(Bwt,Hwt))^2
## [1] 0.6466209
with(cats,cor.test(Bwt,Hwt))
## 
##  Pearson's product-moment correlation
## 
## data:  Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7375682 0.8552122
## sample estimates:
##       cor 
## 0.8041274
with(cats,cor.test(Bwt,Hwt,alternative = "greater",conf.level = .8))
## 
##  Pearson's product-moment correlation
## 
## data:  Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is greater than 0
## 80 percent confidence interval:
##  0.7776141 1.0000000
## sample estimates:
##       cor 
## 0.8041274
cor.test(~ Bwt + Hwt, data = cats)
## 
##  Pearson's product-moment correlation
## 
## data:  Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7375682 0.8552122
## sample estimates:
##       cor 
## 0.8041274
cor.test(~ Bwt + Hwt, data = cats, subset = (Sex == "F"))
## 
##  Pearson's product-moment correlation
## 
## data:  Bwt and Hwt
## t = 4.2152, df = 45, p-value = 0.0001186
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2890452 0.7106399
## sample estimates:
##       cor 
## 0.5320497
#The standard method statisticians use to measure the 'significance' of their empirical
#analyses is the p-value.
#For example, if 100 pairs of data have a correlation coefficient of 0.254, the p-value
#is about 0.01. This means there is roughly a 1 in 100 chance of seeing a correlation at
#least this strong if the variables were actually unrelated.
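#As a sketch, the t statistic and p-value that cor.test() reported above can be
#recomputed by hand: t = r*sqrt(n-2)/sqrt(1-r^2) on n-2 degrees of freedom.
r = with(cats, cor(Bwt, Hwt))
t.stat = r * sqrt(142) / sqrt(1 - r^2)    #n = 144 cats, so df = 142
t.stat                                    #about 16.119, as in the output above
2 * pt(-abs(t.stat), df = 142)            #two-sided p-value, essentially zero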

#95 percent confidence interval
#The true correlation lies between 0.7375682 and 0.8552122 in the sense that intervals
#constructed this way capture the true correlation (not the true mean) 95% of the time.
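#The interval itself comes from Fisher's z-transformation; a sketch of the computation,
#using r as computed in the check above:
z = atanh(r)                              #Fisher's z
se = 1 / sqrt(144 - 3)                    #standard error of z
tanh(z + c(-1, 1) * qnorm(0.975) * se)    #back-transform: about 0.738 to 0.855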


with(cats, plot(Bwt, Hwt, type="n", las=1, xlab="Body Weight in kg", ylab="Heart Weight in g",main="Heart Weight vs. Body Weight of Cats"))
with(cats, points(Bwt[Sex=="F"], Hwt[Sex=="F"], pch=16, col="red"))
with(cats, points(Bwt[Sex=="M"], Hwt[Sex=="M"], pch=17, col="blue"))

rm(cats)
#rm() Function
#Used to remove objects. Objects can be named directly, passed as character strings via
#the list argument, or a combination of both. All objects thus specified will be removed.
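#A quick sketch of the character-string form: these two lines have the same effect.
tmp = 1
rm(list = "tmp")   #equivalent to rm(tmp)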

data(cement)
str(cement)
## 'data.frame':    13 obs. of  5 variables:
##  $ x1: int  7 1 11 11 7 11 3 1 2 21 ...
##  $ x2: int  26 29 56 31 52 55 71 31 54 47 ...
##  $ x3: int  6 15 8 8 6 9 17 22 18 4 ...
##  $ x4: int  60 52 20 47 33 22 6 44 22 26 ...
##  $ y : num  78.5 74.3 104.3 87.6 95.9 ...
cor(cement)
##            x1         x2         x3         x4          y
## x1  1.0000000  0.2285795 -0.8241338 -0.2454451  0.7307175
## x2  0.2285795  1.0000000 -0.1392424 -0.9729550  0.8162526
## x3 -0.8241338 -0.1392424  1.0000000  0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550  0.0295370  1.0000000 -0.8213050
## y   0.7307175  0.8162526 -0.5346707 -0.8213050  1.0000000
cov(cement)
##           x1         x2         x3          x4          y
## x1  34.60256   20.92308 -31.051282  -24.166667   64.66346
## x2  20.92308  242.14103 -13.878205 -253.416667  191.07949
## x3 -31.05128  -13.87821  41.025641    3.166667  -51.51923
## x4 -24.16667 -253.41667   3.166667  280.166667 -206.80833
## y   64.66346  191.07949 -51.519231 -206.808333  226.31359
cov.matr = cov(cement)
cov2cor(cov.matr)
##            x1         x2         x3         x4          y
## x1  1.0000000  0.2285795 -0.8241338 -0.2454451  0.7307175
## x2  0.2285795  1.0000000 -0.1392424 -0.9729550  0.8162526
## x3 -0.8241338 -0.1392424  1.0000000  0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550  0.0295370  1.0000000 -0.8213050
## y   0.7307175  0.8162526 -0.5346707 -0.8213050  1.0000000
pairs(cement)

ls()
## [1] "cement"   "cov.matr"
rm(cement, cov.matr)
coach1 = c(1,2,3,4,5,6,7,8,9,10)
coach2 = c(4,8,1,5,9,2,10,7,3,6)
#Nominal and numerical data
#Nominal data is data that can be sorted into categories. For example, people can be
#sorted by hair colour (black, brown, blond, red, other) or by sex (male or female).
#When creating the categories, it is important that each value fits into exactly one of them.
#Nominal data is categorical data where the order of the categories is arbitrary,
#e.g. hair colour coded as black=1, brown=2, blond=3, red=4, other=5.
#Numeric variables have values that describe a measurable quantity as a number, such as
#'how many' or 'how much'; numeric variables are therefore quantitative variables.
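#In R, nominal data is stored as a factor; a sketch of the hair colour coding above:
hair = factor(c(1, 3, 2, 2, 5), levels = 1:5,
              labels = c("black", "brown", "blond", "red", "other"))
hair
## [1] black blond brown brown other
## Levels: black brown blond red other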


cor(coach1,coach2,method = "spearman")
## [1] 0.1272727
cor.test(coach1,coach2,method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  coach1 and coach2
## S = 144, p-value = 0.7329
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1272727
cor(coach1,coach2,method = "kendall")
## [1] 0.1111111
cor.test(coach1,coach2,method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  coach1 and coach2
## T = 25, p-value = 0.7275
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.1111111
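#Spearman's rho is simply Pearson's correlation computed on the ranks;
#a quick check reproduces the value above:
cor(rank(coach1), rank(coach2))
## [1] 0.1272727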
ls()
## [1] "coach1" "coach2"
rm(coach1,coach2)

data(cats)
cats[12,2] = NA
cats[101,3] = NA
cats[132,2:3] = NA
summary(cats)
##  Sex         Bwt             Hwt       
##  F:47   Min.   :2.000   Min.   : 6.30  
##  M:97   1st Qu.:2.300   1st Qu.: 8.85  
##         Median :2.700   Median :10.10  
##         Mean   :2.723   Mean   :10.62  
##         3rd Qu.:3.000   3rd Qu.:12.07  
##         Max.   :3.900   Max.   :20.50  
##         NA's   :2       NA's   :2
with(cats, cor(Bwt, Hwt))
## [1] NA
with(cats,cov(Bwt,Hwt))
## [1] NA
with(cats,plot(Bwt,Hwt))

with(cats,cor(Bwt,Hwt, use = "pairwise"))
## [1] 0.8066677
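#cov() takes the same use argument; a sketch (with only two variables,
#use = "complete.obs" gives the same result as "pairwise" here):
with(cats, cov(Bwt, Hwt, use = "pairwise"))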
rm(cats)
data(cats)
lm(Hwt ~ Bwt, data = cats)
## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Coefficients:
## (Intercept)          Bwt  
##     -0.3567       4.0341
lm.out = lm(Hwt ~ Bwt, data = cats)
lm.out
## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Coefficients:
## (Intercept)          Bwt  
##     -0.3567       4.0341
summary(lm.out)
## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5694 -0.9634 -0.0921  1.0426  5.1238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.3567     0.6923  -0.515    0.607    
## Bwt           4.0341     0.2503  16.119   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared:  0.6466, Adjusted R-squared:  0.6441 
## F-statistic: 259.8 on 1 and 142 DF,  p-value: < 2.2e-16
options(show.signif.stars = FALSE)
anova(lm.out)
## Analysis of Variance Table
## 
## Response: Hwt
##            Df Sum Sq Mean Sq F value    Pr(>F)
## Bwt         1 548.09  548.09  259.83 < 2.2e-16
## Residuals 142 299.53    2.11
#ANOVA: the analysis of variance is a commonly used method for testing for differences
#among the means of several samples.
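#For simple regression the ANOVA F statistic is just the square of the slope's
#t statistic from summary() above; a quick check:
16.119^2   #about 259.8, the F value for Bwt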
plot(Hwt ~ Bwt, data = cats, main="Kitty cat plot")
abline(lm.out, col = "red")

#abline()
#This function is used to add one or more straight lines to the current plot.
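#abline() also accepts raw coefficients and horizontal or vertical positions; a sketch:
abline(a = -0.3567, b = 4.0341, lty = 2)   #the same fitted line, from intercept and slope
abline(h = mean(cats$Hwt), lty = 3)        #horizontal reference line at the mean Hwt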


par(mfrow = c(2,2))
plot(lm.out)

cats[144,]
##     Sex Bwt  Hwt
## 144   M 3.9 20.5
lm.out$fitted[144]
##      144 
## 15.37618
lm.out$residuals[144]
##      144 
## 5.123818
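#A quick consistency check: fitted value plus residual recovers the observed value
lm.out$fitted[144] + lm.out$residuals[144]   #20.5, the Hwt of cat 144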
par(mfrow = c(1,1))
plot(cooks.distance(lm.out))

lm.without144 = lm(Hwt ~ Bwt, data=cats, subset=(Hwt<20.5))
lm.without144
## 
## Call:
## lm(formula = Hwt ~ Bwt, data = cats, subset = (Hwt < 20.5))
## 
## Coefficients:
## (Intercept)          Bwt  
##       0.118        3.846
rlm(Hwt ~ Bwt, data = cats)
## Call:
## rlm(formula = Hwt ~ Bwt, data = cats)
## Converged in 5 iterations
## 
## Coefficients:
## (Intercept)         Bwt 
##  -0.1361777   3.9380535 
## 
## Degrees of freedom: 144 total; 142 residual
## Scale estimate: 1.52
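#Plotting both fits shows how little the single outlier moves the robust line; a sketch:
plot(Hwt ~ Bwt, data = cats)
abline(lm.out, col = "red")                        #ordinary least squares
abline(rlm(Hwt ~ Bwt, data = cats), col = "blue")  #robust fit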
plot(Hwt ~Bwt, data = cats)
lines(lowess(cats$Hwt ~ cats$Bwt), col = "red")

scatter.smooth(cats$Hwt ~ cats$Bwt)

scatter.smooth(cats$Hwt ~ cats$Bwt, pch=16, cex=.6)

#Non-parametric methods
#A statistical method is called non-parametric if it makes no assumptions about the
#population distribution or the sample size.

#Smoothing
#Smoothing is a statistical technique that helps you spot trends in noisy data, and
#especially to compare trends between two or more fluctuating time series.
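
#The span argument controls the amount of smoothing in scatter.smooth(); smaller
#values follow the data more closely (a sketch, the default span is 2/3):
scatter.smooth(cats$Hwt ~ cats$Bwt, span = 1/3, pch = 16, cex = .6)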