Exam-Final

\(\textit{1. Use the “sleep.txt” data to answer the following questions. }\)

library(rmarkdown)

## Warning: package 'rmarkdown' was built under R version 3.2.5

# Point the session at the folder that holds the exam data files; the
# relative paths given to read.csv() here and later resolve against it.
# NOTE(review): this absolute path is machine-specific — confirm before
# running on another computer.
setwd("/Users/yusufsultan/rWork")
# Load the sleep data set into a data frame.
Sleep.data <- read.csv("sleep.csv")
# Show the column types and the first few values of each column.
str(Sleep.data)

## 'data.frame':    54 obs. of  5 variables:
##  $ Species : Factor w/ 54 levels "African elephant       ",..: 32 52 29 54 31 13 49 20 2 50 ...
##  $ BodyWgt : num  0.048 0.104 0.005 3.5 0.023 0.075 0.06 0.12 1 0.9 ...
##  $ BrainWgt: num  0.33 2.5 0.14 3.9 0.4 1.2 1 1 6.6 2.6 ...
##  $ Sleep   : num  12.8 15.8 9.1 19.4 13.2 8.4 10.3 14.4 8.3 13.3 ...
##  $ LifeSpan: num  2 2.3 2.6 3 3.2 3.5 3.5 3.9 4.5 4.5 ...

head(Sleep.data )

##                        Species BodyWgt BrainWgt Sleep LifeSpan
## 1 Musk shrew                     0.048     0.33  12.8      2.0
## 2     Tree shrew                 0.104     2.50  15.8      2.3
## 3 Lesser short-tailed shrew      0.005     0.14   9.1      2.6
## 4   Water opossum                3.500     3.90  19.4      3.0
## 5  Mouse                         0.023     0.40  13.2      3.2
## 6  Eastern American mole         0.075     1.20   8.4      3.5

tail(Sleep.data)

##                        Species BodyWgt BrainWgt Sleep LifeSpan
## 49       Gray seal               85.00      325   6.2       41
## 50   Horse                      521.00      655   2.9       46
## 51        Chimpanzee             52.16      440   9.7       50
## 52      Echidna                   3.00       25   8.6       50
## 53     Asian elephant          2547.00     4603   3.9       69
## 54 Homo sapiens                  62.00     1320   8.0      100

b. The strong asymmetry for all variables except Sleep indicates that a log transformation is appropriate for those variables. Construct a new data frame that contains Sleep, replaces BodyWgt, BrainWgt, LifeSpan by their log-transformed values, and then construct histograms of each variable in this new data frame with all of them on the same graphics page.

require(ggplot2)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.2.5

head(Sleep.data)

##                        Species BodyWgt BrainWgt Sleep LifeSpan
## 1 Musk shrew                     0.048     0.33  12.8      2.0
## 2     Tree shrew                 0.104     2.50  15.8      2.3
## 3 Lesser short-tailed shrew      0.005     0.14   9.1      2.6
## 4   Water opossum                3.500     3.90  19.4      3.0
## 5  Mouse                         0.023     0.40  13.2      3.2
## 6  Eastern American mole         0.075     1.20   8.4      3.5

# Body weight on the raw scale: histogram, then density.
ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_histogram(fill = "#D55E00")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_density(fill = "#D55E00")

# Brain weight on the raw scale: histogram, then density.
ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_histogram(fill = "#009E73")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_density(fill = "#009E73")

# Life span on the raw scale: histogram, then density.
ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_histogram(fill = "#5760AB")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_density(fill = "#5760AB")

summary(Sleep.data$BodyWgt)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.005    0.548    3.342  213.000   48.200 6654.000

summary(Sleep.data$BrainWgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.14    4.25   19.25  301.70  166.00 5712.00

summary(Sleep.data$LifeSpan)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   6.125  13.850  19.850  27.750 100.000

# Base-10 log of body weight, kept as a free-standing vector.
Log_BodyWgt <- log10(Sleep.data[["BodyWgt"]])
summary(Log_BodyWgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3010 -0.2703  0.5240  0.5522  1.6780  3.8230

# Base-10 log of brain weight, kept as a free-standing vector.
Log_BrainWgt <- log10(Sleep.data$BrainWgt)
# Summarise the vector just created. The original call summarised
# Log_BodyWgt again (copy-paste slip), so the figures printed after this
# chunk are the body-weight ones and need regenerating.
summary(Log_BrainWgt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3010 -0.2703  0.5240  0.5522  1.6780  3.8230

# Base-10 log of life span, kept as a free-standing vector.
Log_LifeSpan <- log10(Sleep.data$LifeSpan)
# Summarise the vector just created. The original call summarised
# Log_BodyWgt again (copy-paste slip), so the figures printed after this
# chunk are the body-weight ones and need regenerating.
summary(Log_LifeSpan)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3010 -0.2703  0.5240  0.5522  1.6780  3.8230

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.2.4

# Body weight: raw, log10 and square-root histograms stacked in one column.
P1_BW <- ggplot(Sleep.data, aes(x = BodyWgt)) +
  geom_histogram(fill = "#D55E00")
P2_BW <- ggplot(Sleep.data, aes(x = log10(BodyWgt))) +
  geom_histogram(fill = "#009E73")
P3_BW <- ggplot(Sleep.data, aes(x = sqrt(BodyWgt))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_BW, P2_BW, P3_BW, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density of the log10 body weights.
ggplot(Sleep.data, aes(x = Log_BodyWgt)) +
  geom_density(fill = "#D55E00")

# Brain weight: the same three-way comparison.
P1_BrW <- ggplot(Sleep.data, aes(x = BrainWgt)) +
  geom_histogram(fill = "#D55E00")
P2_BrW <- ggplot(Sleep.data, aes(x = log10(BrainWgt))) +
  geom_histogram(fill = "#009E73")
P3_BrW <- ggplot(Sleep.data, aes(x = sqrt(BrainWgt))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_BrW, P2_BrW, P3_BrW, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density of the log10 brain weights.
ggplot(Sleep.data, aes(x = Log_BrainWgt)) +
  geom_density(fill = "#D55E00")

# Life span: the same three-way comparison.
P1_LS <- ggplot(Sleep.data, aes(x = LifeSpan)) +
  geom_histogram(fill = "#D55E00")
P2_LS <- ggplot(Sleep.data, aes(x = log10(LifeSpan))) +
  geom_histogram(fill = "#009E73")
P3_LS <- ggplot(Sleep.data, aes(x = sqrt(LifeSpan))) +
  geom_histogram(fill = "#7750AB")
grid.arrange(P1_LS, P2_LS, P3_LS, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density of the log10 life spans.
ggplot(Sleep.data, aes(x = Log_LifeSpan)) +
  geom_density(fill = "#D55E00")

c. Plot LifeSpan versus BrainWgt with LifeSpan on the y-axis and include an informative title. Repeat using the log-transformed variables instead. Superimpose lines corresponding to the respective means of the variables for each plot.

require(lubridate)

## Loading required package: lubridate

## Warning: package 'lubridate' was built under R version 3.2.5

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# Task (c), raw scale: LifeSpan (y) against BrainWgt (x) with a title and
# dashed reference lines at the mean of each variable, as the task requires.
ggplot(Sleep.data, aes(x = BrainWgt, y = LifeSpan)) +
  geom_point(colour = "#D55E00", alpha = 0.9) +
  geom_vline(xintercept = mean(Sleep.data$BrainWgt), linetype = "dashed") +
  geom_hline(yintercept = mean(Sleep.data$LifeSpan), linetype = "dashed") +
  ggtitle("Life span versus brain weight (dashed lines mark the means)")

# Task (c), log10 scale: same plot built from the log-transformed vectors,
# with the reference lines at the means of the transformed values.
ggplot(Sleep.data, aes(x = Log_BrainWgt, y = Log_LifeSpan)) +
  geom_point(colour = "#D55E00", alpha = 0.9) +
  geom_vline(xintercept = mean(Log_BrainWgt), linetype = "dashed") +
  geom_hline(yintercept = mean(Log_LifeSpan), linetype = "dashed") +
  ggtitle("log10 life span versus log10 brain weight (dashed lines mark the means)")

# Raw-scale scatter again, with the points coloured by body weight.
ggplot(Sleep.data, aes(x = BrainWgt, y = LifeSpan, colour = BodyWgt)) +
  geom_point()

# Log-scale scatter, with the points coloured by log10 body weight.
ggplot(Sleep.data, aes(x = Log_BrainWgt, y = Log_LifeSpan, colour = Log_BodyWgt)) +
  geom_point()

d. Obtain and interpret the correlation between LifeSpan and BrainWgt. Repeat for log(LifeSpan) and log(BrainWgt).

# Correlation between brain weight and life span via the built-in cor().
cor(Sleep.data$BrainWgt , Sleep.data$LifeSpan)

## [1] 0.5060302

# Rebuild the same correlation by hand, one ingredient at a time:
# deviations of each variable from its own mean ...
BrainWgt_part <- Sleep.data$BrainWgt - mean(Sleep.data$BrainWgt)
LifeSpan_part <- Sleep.data$LifeSpan - mean(Sleep.data$LifeSpan)
# ... the n - 1 divisor ...
nMinusOne <- (nrow(Sleep.data) - 1)
# ... and the two sample standard deviations.
xSD <- sd(Sleep.data$BrainWgt)
ySD <- sd(Sleep.data$LifeSpan)
# Correlation formula: sum of cross-products of the deviations divided by
# (n - 1) * sd(x) * sd(y); the printed value matches the cor() result above.
sum (BrainWgt_part * LifeSpan_part) / (nMinusOne * xSD * ySD)

## [1] 0.5060302

To compare with the other variables, we apply cor to a matrix of columns; this only works with numeric variables.

# Correlation matrix of BrainWgt, Sleep and LifeSpan. The columns are picked
# by name (they are positions 3 to 5) so the call does not depend on the
# column order of the file.
cor(Sleep.data[, c("BrainWgt", "Sleep", "LifeSpan")])

##            BrainWgt      Sleep   LifeSpan
## BrainWgt  1.0000000 -0.3570985  0.5060302
## Sleep    -0.3570985  1.0000000 -0.4102024
## LifeSpan  0.5060302 -0.4102024  1.0000000

# Pairwise plot matrix for the same three columns. The former
# `parms = list(labeSize =18)` argument was reported by ggpairs() as an
# ignored extra argument, so it is dropped; the plot itself is unchanged.
GGally::ggpairs(Sleep.data[, c("BrainWgt", "Sleep", "LifeSpan")])

## Warning: replacing previous import by 'utils::capture.output' when loading
## 'GGally'

## Warning: replacing previous import by 'utils::head' when loading 'GGally'

## Warning: replacing previous import by 'utils::installed.packages' when
## loading 'GGally'

## Warning: replacing previous import by 'utils::str' when loading 'GGally'

## Warning in warn_if_args_exist(list(...)): Extra arguments: 'parms' are
## being ignored. If these are meant to be aesthetics, submit them using the
## 'mapping' variable within ggpairs with ggplot2::aes or ggplot2::aes_string.

# Raw-scale scatter with a smoother fitted by method "lm".
ggplot(data = Sleep.data, mapping = aes(x = BrainWgt, y = LifeSpan)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("BrainWgt") +
  ylab("LifeSpan")

# Log-scale scatter with a smoother fitted by method "lm".
ggplot(data = Sleep.data, mapping = aes(x = Log_BrainWgt, y = Log_LifeSpan)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Log BrainWgt") +
  ylab("Log LifeSpan")

Calculate a regression using the lm function.

# Least-squares fit of LifeSpan on BrainWgt (raw scale); printing the object
# shows the call and the estimated intercept and slope.
Sleep.data.LM <- lm(LifeSpan ~BrainWgt , data = Sleep.data)
Sleep.data.LM

## 
## Call:
## lm(formula = LifeSpan ~ BrainWgt, data = Sleep.data)
## 
## Coefficients:
## (Intercept)     BrainWgt  
##   16.956369     0.009597

# Same fit on the log10 scale. Log_LifeSpan and Log_BrainWgt are not columns
# of Sleep.data; they are the free-standing vectors created earlier, which
# lm() picks up from the formula's environment when they are absent from
# `data`.
Sleep.data.LMLog <- lm(Log_LifeSpan ~Log_BrainWgt , data = Sleep.data)
Sleep.data.LMLog

## 
## Call:
## lm(formula = Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
## 
## Coefficients:
##  (Intercept)  Log_BrainWgt  
##       0.6972        0.3063

summary(Sleep.data.LM)

## 
## Call:
## lm(formula = LifeSpan ~ BrainWgt, data = Sleep.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.175 -11.116  -3.849   8.204  70.376 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.956369   2.331713   7.272 1.82e-09 ***
## BrainWgt     0.009597   0.002268   4.231 9.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.38 on 52 degrees of freedom
## Multiple R-squared:  0.2561, Adjusted R-squared:  0.2418 
## F-statistic:  17.9 on 1 and 52 DF,  p-value: 9.476e-05

summary(Sleep.data.LMLog)

## 
## Call:
## lm(formula = Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45733 -0.15524 -0.02595  0.10079  0.86748 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.69716    0.05722  12.184  < 2e-16 ***
## Log_BrainWgt  0.30632    0.03298   9.289 1.25e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2606 on 52 degrees of freedom
## Multiple R-squared:  0.624,  Adjusted R-squared:  0.6167 
## F-statistic: 86.29 on 1 and 52 DF,  p-value: 1.246e-12

# Residuals of the raw-scale fit (Sleep.data.LM) against the predictor.
# The original mapped y to LifeSpan itself while labelling the axis
# "Residuals" and drawing a zero line; the model residuals are plotted now.
# They are attached to a copy of Sleep.data so the other columns (e.g. Sleep)
# stay available to layers added later.
S1 <- ggplot(
  transform(Sleep.data, Residual = residuals(Sleep.data.LM)),
  aes(x = BrainWgt, y = Residual)
) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(method = "loess") +
  labs(x = "BrainWgt Values", y = "Residuals")
S1

# Same plot with the points coloured by the Sleep column.
S1 +
  geom_point(aes(color = Sleep))

# Residuals of the log-scale fit (Sleep.data.LMLog) against log10 brain
# weight; the same correction as above (the original plotted Log_LifeSpan).
S1_log <- ggplot(
  transform(
    Sleep.data,
    Log_BrainWgt = Log_BrainWgt,
    Residual = residuals(Sleep.data.LMLog)
  ),
  aes(x = Log_BrainWgt, y = Residual)
) +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_smooth(method = "loess") +
  labs(x = "Log BrainWgt Values", y = "Residuals")
S1_log

# Same plot with the points coloured by the Sleep column.
S1_log +
  geom_point(aes(color = Sleep))

 Sleept.lm <- lm(LifeSpan ~ BrainWgt, data = Sleep.data)
# Base-graphics diagnostic plots for the raw-scale fit.
plot(Sleept.lm)

Sleeptlog.lm <- lm(Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data)
# Diagnostic plots for the log-scale fit; the original call re-plotted
# Sleept.lm here, so the log model was never diagnosed.
plot(Sleeptlog.lm)

coef(Sleept.lm)

##  (Intercept)     BrainWgt 
## 16.956368579  0.009597038

coef(Sleeptlog.lm)

##  (Intercept) Log_BrainWgt 
##    0.6971573    0.3063217

# Raw-scale scatter with the fitted regression line. The original passed
# coef(Sleep.data) — the data frame rather than a fitted model — to
# abline(); the fitted model object supplies the intercept and slope.
plot(LifeSpan ~ BrainWgt, data = Sleep.data, pch = 16)
abline(Sleept.lm)

# Log-scale scatter with the line from the log-scale fit.
plot(Log_LifeSpan ~ Log_BrainWgt, data = Sleep.data, pch = 16)
abline(Sleeptlog.lm)

# Normal Q-Q plot of log10 life span. geom_abline() with no arguments draws
# y = x, which is the right reference only for standardised data; the sample
# is not standardised, so the line uses the sample mean as intercept and the
# sample standard deviation as slope.
ggplot(Sleep.data, aes(sample = Log_LifeSpan)) +
  stat_qq() +
  geom_abline(intercept = mean(Log_LifeSpan), slope = sd(Log_LifeSpan))

# Histogram of the log10 life spans (default binning).
ggplot(data = Sleep.data) +
  geom_histogram(aes(x = Log_LifeSpan))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the log10 brain weights (default binning).
ggplot(data = Sleep.data) +
  geom_histogram(aes(x = Log_BrainWgt))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a. Create and print a SAS dataset or R dataframe named Flour.

require(UsingR)

## Loading required package: UsingR

## Loading required package: MASS

## Warning: package 'MASS' was built under R version 3.2.5

## Loading required package: HistData

## Warning: package 'HistData' was built under R version 3.2.5

## Loading required package: Hmisc

## Warning: package 'Hmisc' was built under R version 3.2.5

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.2.5

## Loading required package: survival

## Warning: package 'survival' was built under R version 3.2.5

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## 
## Attaching package: 'UsingR'

## The following object is masked from 'package:survival':
## 
##     cancer

# Read the flour data into a data frame; the file has a .txt extension but
# is parsed by read.csv(), i.e. as comma-separated with a header row.
Flour <- read.csv("flour.txt")
# Show the column types and the first few values of each column.
str(Flour)

## 'data.frame':    15 obs. of  2 variables:
##  $ Weight: num  5050 10249 20000 7420 24685 ...
##  $ NBags : int  100 205 450 150 500 200 150 100 150 500 ...

head(Flour)

##   Weight NBags
## 1   5050   100
## 2  10249   205
## 3  20000   450
## 4   7420   150
## 5  24685   500
## 6  10206   200

tail(Flour)

##    Weight NBags
## 10  24000   500
## 11   4900   100
## 12  14501   300
## 13  28000   600
## 14  17002   400
## 15  16100   400

summary(Flour)

##      Weight          NBags    
##  Min.   : 4900   Min.   :100  
##  1st Qu.: 7244   1st Qu.:150  
##  Median :10249   Median :205  
##  Mean   :13437   Mean   :287  
##  3rd Qu.:18501   3rd Qu.:425  
##  Max.   :28000   Max.   :600

b. Use SAS or R to find the simple linear regression model for predicting NBags from Weight.

# Task (b): fit the simple linear regression of NBags on Weight and print
# its summary. The original only drew a smoothed scatter and never produced
# the model itself.
Flour.lm <- lm(NBags ~ Weight, data = Flour)
summary(Flour.lm)

# Scatter of the data with a smoother fitted by method "lm".
ggplot(Flour, aes(x = Weight, y = NBags)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Weight", y = "NBags")

c. Include the relevant output in your Word file.

d. Use SAS or R to compute the means and standard deviations for Weight and NBags.

mean(Flour$Weight)

## [1] 13437.2

sd(Flour$Weight)

## [1] 7850.551

mean(Flour$NBags)

## [1] 287

sd(Flour$NBags)

## [1] 172.4798

e. For the simple linear regression model, create the residual and normal plots.

Exam-Final

Yusuf Sultan

4/30/2017