R Markdown

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.3
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'dplyr' was built under R version 3.3.3
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Read the skin cancer data set. The first line of the file holds the column
# names, hence `header = TRUE` (spelled out, because `T` is an ordinary
# variable that can be reassigned).
dataset <- read.table(
  "https://onlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/skincancer.txt",
  header = TRUE
)

# Preview the first rows to check that the columns were parsed as expected
head(dataset)
##         State  Lat Mort Ocean  Long
## 1     Alabama 33.0  219     1  87.0
## 2     Arizona 34.5  160     0 112.0
## 3    Arkansas 35.0  170     0  92.5
## 4  California 37.5  182     1 119.5
## 5    Colorado 39.0  149     0 105.5
## 6 Connecticut 41.8  159     1  72.8

Visualizing mortality (Mort) vs. latitude (Lat)

# Scatter plot of mortality against latitude. Using bare column names with
# `data = dataset` labels the axes "Lat" and "Mort" instead of the raw
# `dataset$...` expressions.
plot(Mort ~ Lat, data = dataset)

# The scatter plot suggests a strong negative linear relationship between
# 'Lat' and 'Mort', so fit a simple linear regression.
# The formula uses bare column names with `data = dataset` rather than
# `dataset$Mort ~ dataset$Lat`: with `dataset$Lat` in the formula, predict()
# cannot match the `Lat` column of `newdata`, warns that "'newdata' had 1 row
# but variables found have 49 rows", and returns intervals for the 49
# training rows instead of the requested point.
model_lm <- lm(Mort ~ Lat, data = dataset)
# Check summary of the model

summary(model_lm)
## 
## Call:
## lm(formula = Mort ~ Lat, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## Lat          -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13
# Draw the four diagnostic plots in a 2 x 2 grid, then restore the previous
# graphics settings so that later plots are not affected.
old_par <- par(mfrow = c(2, 2))

plot(model_lm)
par(old_par)

* The residuals vs. fitted plot shows that the spread of the residuals is roughly constant across the fitted values. The points in the normal Q-Q plot follow the reference line closely, so the residuals look approximately normal, and the residuals vs. leverage plot shows no obviously influential outliers.

# Calculate 95% confidence intervals for the model coefficients

confint(model_lm, level = 0.95)
##                  2.5 %     97.5 %
## (Intercept) 341.285151 437.093552
## Lat          -7.181404  -4.773867
# Calculate the prediction interval (default level 0.95) for a single new
# observation at Lat = 38.5; the column name in `newdata` must match the
# predictor name used in the model formula.

predict(model_lm, newdata = data.frame(Lat = 38.5), interval = "prediction")
##        fit      lwr      upr
## 1 159.0504 120.1856 197.9152
# Analysis of variance table for the fitted model
anova(model_lm)
## Analysis of Variance Table
## 
## Response: Mort
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Lat        1  36464   36464  99.797 3.309e-13 ***
## Residuals 47  17173     365                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1