library("MASS")
data(cats)
str(cats)
## 'data.frame': 144 obs. of 3 variables:
## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.000 Min. : 6.30
## M:97 1st Qu.:2.300 1st Qu.: 8.95
## Median :2.700 Median :10.10
## Mean :2.724 Mean :10.63
## 3rd Qu.:3.025 3rd Qu.:12.12
## Max. :3.900 Max. :20.50
with(cats,plot(Bwt,Hwt))
title(main = "Heart weight (g) vs Boday weight (kg) of Domestic cats")

with(cats,plot(Hwt~Bwt))

#The with() function apply an expression to a dataset. with(data, expr)
#data: a list or a data frame
#expr: one or more expressions to evaluate using the contents of data
plot(Hwt~Bwt,data = cats)
#y~x implies that we want to see how the y-values differ for the various values of x and x~y reverses the roles of x and y. For those with a little math background, y~x means that we want to plot y as a function of x: y=f(x).
with(cats,cor(Bwt,Hwt))
## [1] 0.8041274
with(cats,cor(Bwt,Hwt))^2
## [1] 0.6466209
with(cats,cor.test(Bwt,Hwt))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7375682 0.8552122
## sample estimates:
## cor
## 0.8041274
with(cats,cor.test(Bwt,Hwt,alternative = "greater",conf.level = .8))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is greater than 0
## 80 percent confidence interval:
## 0.7776141 1.0000000
## sample estimates:
## cor
## 0.8041274
cor.test( ~Bwt +Hwt, data = cats)
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 16.119, df = 142, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7375682 0.8552122
## sample estimates:
## cor
## 0.8041274
cor.test(~ Bwt +Hwt, data = cats,subset = (Sex =="F"))
##
## Pearson's product-moment correlation
##
## data: Bwt and Hwt
## t = 4.2152, df = 45, p-value = 0.0001186
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2890452 0.7106399
## sample estimates:
## cor
## 0.5320497
#The standard method that statisticians use to measure the ‘significance’ of their empirical analyses is the p-value.
#For example, that if there are 100 pairs of data whose correlation coefficient is 0.254, then the p-value is 0.01. This means that there is a 1 in 100 chance that we would have seen these observations if the variables were unrelated.
#95 percent confidence interval
#Means correlation between 0.7375682 and 0.8552122 since 95% of the time confidence intervals contain the true mean.
with(cats, plot(Bwt, Hwt, type="n", las=1, xlab="Body Weight in kg", ylab="Heart Weight in g",main="Heart Weight vs. Body Weight of Cats"))
with(cats, points(Bwt[Sex=="F"], Hwt[Sex=="F"], pch=16, col="red"))
with(cats, points(Bwt[Sex=="M"], Hwt[Sex=="M"], pch=17, col="blue"))

rm(cats)
#rm() Function
#Used to remove objects. These can be specified successively as character strings, or in the character vector list, or through a combination of both. All objects thus specified will be removed.
data(cement)
str(cement)
## 'data.frame': 13 obs. of 5 variables:
## $ x1: int 7 1 11 11 7 11 3 1 2 21 ...
## $ x2: int 26 29 56 31 52 55 71 31 54 47 ...
## $ x3: int 6 15 8 8 6 9 17 22 18 4 ...
## $ x4: int 60 52 20 47 33 22 6 44 22 26 ...
## $ y : num 78.5 74.3 104.3 87.6 95.9 ...
cor(cement)
## x1 x2 x3 x4 y
## x1 1.0000000 0.2285795 -0.8241338 -0.2454451 0.7307175
## x2 0.2285795 1.0000000 -0.1392424 -0.9729550 0.8162526
## x3 -0.8241338 -0.1392424 1.0000000 0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550 0.0295370 1.0000000 -0.8213050
## y 0.7307175 0.8162526 -0.5346707 -0.8213050 1.0000000
cov(cement)
## x1 x2 x3 x4 y
## x1 34.60256 20.92308 -31.051282 -24.166667 64.66346
## x2 20.92308 242.14103 -13.878205 -253.416667 191.07949
## x3 -31.05128 -13.87821 41.025641 3.166667 -51.51923
## x4 -24.16667 -253.41667 3.166667 280.166667 -206.80833
## y 64.66346 191.07949 -51.519231 -206.808333 226.31359
cov.matr = cov(cement)
cov2cor(cov.matr)
## x1 x2 x3 x4 y
## x1 1.0000000 0.2285795 -0.8241338 -0.2454451 0.7307175
## x2 0.2285795 1.0000000 -0.1392424 -0.9729550 0.8162526
## x3 -0.8241338 -0.1392424 1.0000000 0.0295370 -0.5346707
## x4 -0.2454451 -0.9729550 0.0295370 1.0000000 -0.8213050
## y 0.7307175 0.8162526 -0.5346707 -0.8213050 1.0000000
pairs(cement)

ls()
## [1] "cement" "cov.matr"
rm(cement, cov.matr)
coach1 = c(1,2,3,4,5,6,7,8,9,10)
coach2 = c(4,8,1,5,9,2,10,7,3,6)
#Nominal and numerical data
#Nominal data is data that can be sorted into categories. For example, people can be sorted by hair colour; black, brown, blond, red, other or by gender; male or female. When creating categorising, it's important that each value is only put into one category.
#Nominal data is categorical data where the order of the categories is arbitrary. For example; hair colour black=1, brown=2, blond=3, red=4, other=5
#Numeric variables have values that describe a measurable quantity as a number, like 'how many' or 'how much'. Therefore numeric variables are quantitative variables.
cor(coach1,coach2,method = "spearman")
## [1] 0.1272727
cor.test(coach1,coach2,method = "spearman")
##
## Spearman's rank correlation rho
##
## data: coach1 and coach2
## S = 144, p-value = 0.7329
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1272727
cor(coach1,coach2,method = "kendall")
## [1] 0.1111111
cor.test(coach1,coach2,method = "kendall")
##
## Kendall's rank correlation tau
##
## data: coach1 and coach2
## T = 25, p-value = 0.7275
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.1111111
ls()
## [1] "coach1" "coach2"
rm(coach1,coach2)
data(cats)
cats[12,2] = NA
cats[101,3] = NA
cats[132,2:3] = NA
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.000 Min. : 6.30
## M:97 1st Qu.:2.300 1st Qu.: 8.85
## Median :2.700 Median :10.10
## Mean :2.723 Mean :10.62
## 3rd Qu.:3.000 3rd Qu.:12.07
## Max. :3.900 Max. :20.50
## NA's :2 NA's :2
with(cats, cor(Bwt, Hwt))
## [1] NA
with(cats,cov(Bwt,Hwt))
## [1] NA
with(cats,plot(Bwt,Hwt))

with(cats,cor(Bwt,Hwt, use = "pairwise"))
## [1] 0.8066677
rm(cats)
data(cats)
lm(Hwt ~ Bwt, data = cats)
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Coefficients:
## (Intercept) Bwt
## -0.3567 4.0341
lm.out = lm(Hwt~Bwt,data = cats)
lm.out
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Coefficients:
## (Intercept) Bwt
## -0.3567 4.0341
summary(lm.out)
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5694 -0.9634 -0.0921 1.0426 5.1238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3567 0.6923 -0.515 0.607
## Bwt 4.0341 0.2503 16.119 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 142 degrees of freedom
## Multiple R-squared: 0.6466, Adjusted R-squared: 0.6441
## F-statistic: 259.8 on 1 and 142 DF, p-value: < 2.2e-16
options(show.signif.stars = F)
anova(lm.out)
## Analysis of Variance Table
##
## Response: Hwt
## Df Sum Sq Mean Sq F value Pr(>F)
## Bwt 1 548.09 548.09 259.83 < 2.2e-16
## Residuals 142 299.53 2.11
#ANOVA:The analysis of variance is a commonly used method to determine differences between several samples.
plot(Hwt ~ Bwt, data = cats, main="Kitty cat plot")
abline(lm.out, col = "red")

#abline()
#This function is used for add one or more straight lines through the current plot.
par(mfrow = c(2,2))
plot(lm.out)

cats[144,]
## Sex Bwt Hwt
## 144 M 3.9 20.5
lm.out$fitted[144]
## 144
## 15.37618
lm.out$residuals[144]
## 144
## 5.123818
par(mfrow = c(1,1))
plot(cooks.distance(lm.out))

lm.without144 = lm(Hwt ~ Bwt, data=cats, subset=(Hwt<20.5))
lm.without144
##
## Call:
## lm(formula = Hwt ~ Bwt, data = cats, subset = (Hwt < 20.5))
##
## Coefficients:
## (Intercept) Bwt
## 0.118 3.846
rlm(Hwt ~Bwt, data= cats)
## Call:
## rlm(formula = Hwt ~ Bwt, data = cats)
## Converged in 5 iterations
##
## Coefficients:
## (Intercept) Bwt
## -0.1361777 3.9380535
##
## Degrees of freedom: 144 total; 142 residual
## Scale estimate: 1.52
plot(Hwt ~Bwt, data = cats)
lines(lowess(cats$Hwt~ cats$Bwt),col="red")

scatter.smooth(cats$Hwt~cats$Bwt)

scatter.smooth(cats$Hwt ~ cats$Bwt, pch=16, cex=.6)

#Non parametric method
#A statistical method is called non-parametric if it makes no assumption on the population distribution or sample size.
#Technique smoothing
#Smoothing is a statistical technique that helps you to spot trends in noisy data, and especially to compare trends between two or more fluctuating time series.