\(\textit{1. Use the “sleep.txt” data to answer the following questions. }\)
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 3.2.5
# NOTE(review): hard-coded, machine-specific working directory. The relative
# read.csv() calls in this file are resolved against it.
setwd("/Users/yusufsultan/rWork")
# Read the sleep data set from the working directory.
Sleep.data <- read.csv("sleep.csv")
# Show the column types and a preview of the values.
str(Sleep.data)
## 'data.frame': 54 obs. of 5 variables:
## $ Species : Factor w/ 54 levels "African elephant ",..: 32 52 29 54 31 13 49 20 2 50 ...
## $ BodyWgt : num 0.048 0.104 0.005 3.5 0.023 0.075 0.06 0.12 1 0.9 ...
## $ BrainWgt: num 0.33 2.5 0.14 3.9 0.4 1.2 1 1 6.6 2.6 ...
## $ Sleep : num 12.8 15.8 9.1 19.4 13.2 8.4 10.3 14.4 8.3 13.3 ...
## $ LifeSpan: num 2 2.3 2.6 3 3.2 3.5 3.5 3.9 4.5 4.5 ...
head(Sleep.data )
## Species BodyWgt BrainWgt Sleep LifeSpan
## 1 Musk shrew 0.048 0.33 12.8 2.0
## 2 Tree shrew 0.104 2.50 15.8 2.3
## 3 Lesser short-tailed shrew 0.005 0.14 9.1 2.6
## 4 Water opossum 3.500 3.90 19.4 3.0
## 5 Mouse 0.023 0.40 13.2 3.2
## 6 Eastern American mole 0.075 1.20 8.4 3.5
tail(Sleep.data)
## Species BodyWgt BrainWgt Sleep LifeSpan
## 49 Gray seal 85.00 325 6.2 41
## 50 Horse 521.00 655 2.9 46
## 51 Chimpanzee 52.16 440 9.7 50
## 52 Echidna 3.00 25 8.6 50
## 53 Asian elephant 2547.00 4603 3.9 69
## 54 Homo sapiens 62.00 1320 8.0 100
b. The strong asymmetry for all variables except Sleep indicates that a log transformation is appropriate for those variables. Construct a new data frame that contains Sleep, replaces BodyWgt, BrainWgt, LifeSpan by their log-transformed values, and then construct histograms of each variable in this new data frame with all of them on the same graphics page.
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
head(Sleep.data)
## Species BodyWgt BrainWgt Sleep LifeSpan
## 1 Musk shrew 0.048 0.33 12.8 2.0
## 2 Tree shrew 0.104 2.50 15.8 2.3
## 3 Lesser short-tailed shrew 0.005 0.14 9.1 2.6
## 4 Water opossum 3.500 3.90 19.4 3.0
## 5 Mouse 0.023 0.40 13.2 3.2
## 6 Eastern American mole 0.075 1.20 8.4 3.5
# Raw-scale distributions: for each of BodyWgt, BrainWgt and LifeSpan draw a
# histogram followed by a density curve, one fill colour per variable.
ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_histogram(fill = "#D55E00")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_density(fill = "#D55E00")
ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_histogram(fill = "#009E73")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_density(fill = "#009E73")
ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_histogram(fill = "#5760AB")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_density(fill = "#5760AB")
summary(Sleep.data$BodyWgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005 0.548 3.342 213.000 48.200 6654.000
summary(Sleep.data$BrainWgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.14 4.25 19.25 301.70 166.00 5712.00
summary(Sleep.data$LifeSpan)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 6.125 13.850 19.850 27.750 100.000
# Log-transform (base 10) the three strongly right-skewed variables and
# summarise each transformed vector.
Log_BodyWgt <- log10(Sleep.data$BodyWgt)
summary(Log_BodyWgt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3010 -0.2703 0.5240 0.5522 1.6780 3.8230
Log_BrainWgt <- log10(Sleep.data$BrainWgt)
# Fixed: this call used to summarise Log_BodyWgt again (copy-paste slip).
summary(Log_BrainWgt)
## (re-knit to regenerate; the output recorded here before was for Log_BodyWgt)
Log_LifeSpan <- log10(Sleep.data$LifeSpan)
# Fixed: this call used to summarise Log_BodyWgt again (copy-paste slip).
summary(Log_LifeSpan)
## (re-knit to regenerate; the output recorded here before was for Log_BodyWgt)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.2.4
# Compare BodyWgt on the raw, log10 and square-root scales, stacked on one
# page. ggplot() + geom_histogram() replaces qplot(), which is deprecated in
# current ggplot2; with only `x` supplied, qplot() drew a histogram as well.
P1_BW <- ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_histogram(fill = "#D55E00")
P2_BW <- ggplot(Sleep.data, aes(x = log10(BodyWgt))) +
  geom_histogram(fill = "#009E73")
P3_BW <- ggplot(Sleep.data, aes(x = sqrt(BodyWgt))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_BW, P2_BW, P3_BW, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density on the log10 scale, computed from the data-frame column so the plot
# does not depend on a free-standing global vector.
ggplot(Sleep.data, aes(x = log10(BodyWgt))) +
  geom_density(fill = "#D55E00")
# Compare BrainWgt on the raw, log10 and square-root scales, stacked on one
# page. ggplot() + geom_histogram() replaces qplot(), which is deprecated in
# current ggplot2; with only `x` supplied, qplot() drew a histogram as well.
P1_BrW <- ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_histogram(fill = "#D55E00")
P2_BrW <- ggplot(Sleep.data, aes(x = log10(BrainWgt))) +
  geom_histogram(fill = "#009E73")
P3_BrW <- ggplot(Sleep.data, aes(x = sqrt(BrainWgt))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_BrW, P2_BrW, P3_BrW, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density on the log10 scale, computed from the data-frame column so the plot
# does not depend on a free-standing global vector.
ggplot(Sleep.data, aes(x = log10(BrainWgt))) +
  geom_density(fill = "#D55E00")
# Compare LifeSpan on the raw, log10 and square-root scales, stacked on one
# page. ggplot() + geom_histogram() replaces qplot(), which is deprecated in
# current ggplot2; with only `x` supplied, qplot() drew a histogram as well.
P1_LS <- ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_histogram(fill = "#D55E00")
P2_LS <- ggplot(Sleep.data, aes(x = log10(LifeSpan))) +
  geom_histogram(fill = "#009E73")
P3_LS <- ggplot(Sleep.data, aes(x = sqrt(LifeSpan))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_LS, P2_LS, P3_LS, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density on the log10 scale, computed from the data-frame column so the plot
# does not depend on a free-standing global vector.
ggplot(Sleep.data, aes(x = log10(LifeSpan))) +
  geom_density(fill = "#D55E00")
c. Plot LifeSpan versus BrainWgt with LifeSpan on the y-axis and include an informative title. Repeat using the log-transformed variables instead. Superimpose lines corresponding to the respective means of the variables for each plot.
require(lubridate)
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# LifeSpan against BrainWgt on the raw scale. The dashed lines mark the mean
# of each variable, and the title states what is plotted (both are asked for
# in part c and were missing).
ggplot(Sleep.data, aes(x = BrainWgt, y = LifeSpan)) +
  geom_point(colour = "#D55E00", alpha = 0.9) +
  geom_vline(xintercept = mean(Sleep.data$BrainWgt), linetype = "dashed") +
  geom_hline(yintercept = mean(Sleep.data$LifeSpan), linetype = "dashed") +
  labs(title = "Life span versus brain weight (means shown as dashed lines)")
# The same plot on the log10 scale, with the means of the transformed values.
ggplot(Sleep.data, aes(x = Log_BrainWgt, y = Log_LifeSpan)) +
  geom_point(colour = "#D55E00", alpha = 0.9) +
  geom_vline(xintercept = mean(Log_BrainWgt), linetype = "dashed") +
  geom_hline(yintercept = mean(Log_LifeSpan), linetype = "dashed") +
  labs(
    title = "log10 life span versus log10 brain weight (means shown as dashed lines)"
  )
# Variants with the points coloured by body weight (raw, then log10 scale).
ggplot(Sleep.data, aes(x = BrainWgt, y = LifeSpan, colour = BodyWgt)) +
  geom_point() +
  labs(title = "Life span versus brain weight, coloured by body weight")
ggplot(
  Sleep.data,
  aes(x = Log_BrainWgt, y = Log_LifeSpan, colour = Log_BodyWgt)
) +
  geom_point() +
  labs(
    title = "log10 life span versus log10 brain weight, coloured by log10 body weight"
  )
d. Obtain and interpret the correlation between LifeSpan and BrainWgt. Repeat for log(LifeSpan) and log(BrainWgt).
# Pearson correlation between BrainWgt and LifeSpan using cor().
cor(Sleep.data$BrainWgt, Sleep.data$LifeSpan)
## [1] 0.5060302
# The same correlation on the log10 scale (part d asks for both; this call was
# missing).
cor(log10(Sleep.data$BrainWgt), log10(Sleep.data$LifeSpan))
## (re-knit to show the output of the call above)
# Reproduce the raw-scale value by hand: start with the deviations of each
# variable from its mean.
BrainWgt_part <- Sleep.data$BrainWgt - mean(Sleep.data$BrainWgt)
LifeSpan_part <- Sleep.data$LifeSpan - mean(Sleep.data$LifeSpan)
nMinusOne <- nrow(Sleep.data) - 1
xSD <- sd(Sleep.data$BrainWgt)
ySD <- sd(Sleep.data$LifeSpan)
# Correlation formula: sum of cross-products / ((n - 1) * sd(x) * sd(y)).
sum(BrainWgt_part * LifeSpan_part) / (nMinusOne * xSD * ySD)
## [1] 0.5060302
To compare with the other variables, we apply cor() to several columns at once; this only works with numeric variables.
# Pairwise correlations among the BrainWgt, Sleep and LifeSpan columns
# (columns 3 to 5 of Sleep.data), selected here by name.
cor(Sleep.data[c("BrainWgt", "Sleep", "LifeSpan")])
## BrainWgt Sleep LifeSpan
## BrainWgt 1.0000000 -0.3570985 0.5060302
## Sleep -0.3570985 1.0000000 -0.4102024
## LifeSpan 0.5060302 -0.4102024 1.0000000
# Scatter-plot matrix of columns 3 to 5 (BrainWgt, Sleep, LifeSpan).
# The former `parms = list(labeSize = 18)` argument was dropped: ggpairs()
# ignored it and only emitted an "Extra arguments" warning.
GGally::ggpairs(Sleep.data[, 3:5])
## Warning: replacing previous import by 'utils::capture.output' when loading
## 'GGally'
## Warning: replacing previous import by 'utils::head' when loading 'GGally'
## Warning: replacing previous import by 'utils::installed.packages' when
## loading 'GGally'
## Warning: replacing previous import by 'utils::str' when loading 'GGally'
# Scatter plot of LifeSpan against BrainWgt with a fitted straight line
# (method = "lm") drawn over the points.
ggplot(data = Sleep.data, mapping = aes(BrainWgt, LifeSpan)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("BrainWgt") +
  ylab("LifeSpan")
# The same plot for the log10-transformed variables.
ggplot(data = Sleep.data, mapping = aes(Log_BrainWgt, Log_LifeSpan)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Log BrainWgt") +
  ylab("Log LifeSpan")
Fit a regression using the lm() function.
# Simple linear regression of LifeSpan on BrainWgt (raw scale). Evaluating the
# fitted object on its own line prints the call and the coefficients.
Sleep.data.LM <- lm(LifeSpan ~BrainWgt , data = Sleep.data)
Sleep.data.LM
##
## Call:
## lm(formula = LifeSpan ~ BrainWgt, data = Sleep.data)
##
## Coefficients:
## (Intercept) BrainWgt
## 16.956369 0.009597
# Simple linear regression on the log10 scale. Log_LifeSpan and Log_BrainWgt
# are not columns of Sleep.data; presumably they resolve to the vectors of the
# same name created earlier in the session (confirm if this chunk is moved).
Sleep.data.LMLog <- lm(Log_LifeSpan ~Log_BrainWgt , data = Sleep.data)
Sleep.data.LMLog
##
## Call:
## lm(formula = Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
##
## Coefficients:
## (Intercept) Log_BrainWgt
## 0.6972 0.3063
summary(Sleep.data.LM)
##
## Call:
## lm(formula = LifeSpan ~ BrainWgt, data = Sleep.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.175 -11.116 -3.849 8.204 70.376
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.956369 2.331713 7.272 1.82e-09 ***
## BrainWgt 0.009597 0.002268 4.231 9.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.38 on 52 degrees of freedom
## Multiple R-squared: 0.2561, Adjusted R-squared: 0.2418
## F-statistic: 17.9 on 1 and 52 DF, p-value: 9.476e-05
summary(Sleep.data.LMLog)
##
## Call:
## lm(formula = Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45733 -0.15524 -0.02595 0.10079 0.86748
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.69716 0.05722 12.184 < 2e-16 ***
## Log_BrainWgt 0.30632 0.03298 9.289 1.25e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2606 on 52 degrees of freedom
## Multiple R-squared: 0.624, Adjusted R-squared: 0.6167
## F-statistic: 86.29 on 1 and 52 DF, p-value: 1.246e-12
# Residual plots for the two regressions. The y axis was labelled "Residuals"
# with a zero reference line, but the raw response was being plotted; the
# residuals of the fitted models are now plotted instead.
S1_resid <- data.frame(
  BrainWgt = Sleep.data$BrainWgt,
  Sleep = Sleep.data$Sleep,
  Residual = resid(Sleep.data.LM)
)
S1 <- ggplot(S1_resid, aes(x = BrainWgt, y = Residual)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(method = "loess") +
  labs(x = "BrainWgt Values", y = "Residuals")
S1
# Same plot with the points coloured by hours of Sleep.
S1 +
  geom_point(aes(color = Sleep))
# Residuals of the log10-scale model against the log10 predictor.
S1_log_resid <- data.frame(
  Log_BrainWgt = Log_BrainWgt,
  Sleep = Sleep.data$Sleep,
  Residual = resid(Sleep.data.LMLog)
)
S1_log <- ggplot(S1_log_resid, aes(x = Log_BrainWgt, y = Residual)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(method = "loess") +
  labs(x = "Log BrainWgt Values", y = "Residuals")
S1_log
S1_log +
  geom_point(aes(color = Sleep))
# Base-graphics diagnostics for the raw-scale model.
Sleept.lm <- lm(LifeSpan ~ BrainWgt, data = Sleep.data)
plot(Sleept.lm)
# Diagnostics for the log10-scale model. Fixed: the raw-scale model was being
# plotted a second time here.
Sleeptlog.lm <- lm(Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
plot(Sleeptlog.lm)
coef(Sleept.lm)
## (Intercept) BrainWgt
## 16.956368579 0.009597038
coef(Sleeptlog.lm)
## (Intercept) Log_BrainWgt
## 0.6971573 0.3063217
# Scatter plots with the fitted regression line. Fixed: abline() was being
# given coef(Sleep.data), i.e. coefficients of the data frame, not of a model.
plot(LifeSpan ~ BrainWgt, data = Sleep.data, pch = 16)
abline(reg = Sleept.lm)
plot(Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data, pch = 16)
abline(reg = Sleeptlog.lm)
# Normal QQ plot of log10 LifeSpan. stat_qq() plots sample quantiles against
# standard-normal quantiles, so the reference line for normal data has
# intercept mean(x) and slope sd(x). The default geom_abline() (y = x) used
# before is only meaningful for standardised data.
ggplot(Sleep.data, aes(sample = Log_LifeSpan)) +
  stat_qq() +
  geom_abline(intercept = mean(Log_LifeSpan), slope = sd(Log_LifeSpan))
# Histograms of the two log10-transformed regression variables.
ggplot(Sleep.data, aes(x = Log_LifeSpan)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Sleep.data, aes(x = Log_BrainWgt)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
a. Create and print a SAS dataset or R dataframe named Flour.
require(UsingR)
## Loading required package: UsingR
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.2.5
## Loading required package: HistData
## Warning: package 'HistData' was built under R version 3.2.5
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.2.5
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
# Read the flour data; the relative path is resolved against the current
# working directory.
Flour <- read.csv("flour.txt")
# Show the column types and a preview of the values.
str(Flour)
## 'data.frame': 15 obs. of 2 variables:
## $ Weight: num 5050 10249 20000 7420 24685 ...
## $ NBags : int 100 205 450 150 500 200 150 100 150 500 ...
head(Flour)
## Weight NBags
## 1 5050 100
## 2 10249 205
## 3 20000 450
## 4 7420 150
## 5 24685 500
## 6 10206 200
tail(Flour)
## Weight NBags
## 10 24000 500
## 11 4900 100
## 12 14501 300
## 13 28000 600
## 14 17002 400
## 15 16100 400
summary(Flour)
## Weight NBags
## Min. : 4900 Min. :100
## 1st Qu.: 7244 1st Qu.:150
## Median :10249 Median :205
## Mean :13437 Mean :287
## 3rd Qu.:18501 3rd Qu.:425
## Max. :28000 Max. :600
b. Use SAS or R to find the simple linear regression model for predicting NBags from Weight.
# Fit the simple linear regression for predicting NBags from Weight and print
# its summary. Part b asks for the model, and only a plot was produced before.
Flour.lm <- lm(NBags ~ Weight, data = Flour)
summary(Flour.lm)
# Scatter plot of the data with the fitted straight line drawn over it.
ggplot(Flour, aes(x = Weight, y = NBags)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Weight", y = "NBags")
c. Include the relevant output in your Word file.
d. Use SAS or R to compute the means and standard deviations for Weight and NBags.
# Mean and standard deviation of each Flour column, Weight first, then NBags.
mean(Flour[["Weight"]])
## [1] 13437.2
sd(Flour[["Weight"]])
## [1] 7850.551
mean(Flour[["NBags"]])
## [1] 287
sd(Flour[["NBags"]])
## [1] 172.4798
e. For the simple linear regression model, create the residual and normal plots.