# Importing Data
library("readxl")
states = read_excel("C:/Users/hp/Documents/datasets/datasets/Global/USA/states.xlsx", sheet = "data")
states$Region = factor(states$Region) # Changes Region into a factor
str(states) # checks the data type per variable
## tibble [51 x 11] (S3: tbl_df/tbl/data.frame)
## $ State : chr [1:51] "California" "Texas" "Florida" "New York" ...
## $ Region : Factor w/ 4 levels "Midwest","Northeast",..: 4 3 3 2 2 1 1 3 3 1 ...
## $ Pop : num [1:51] 39.5 29.1 21.5 20.2 13 ...
## $ Pcpi : num [1:51] 66.6 52.8 52.4 71.7 58 ...
## $ Literacy: num [1:51] 76.9 81 80.3 77.9 87.4 87.1 90.9 83.3 86.4 91.7 ...
## $ Murder : num [1:51] 5.6 6.6 5.9 4.2 7.9 9.1 7 8.8 8 7.6 ...
## $ HSGrad : num [1:51] 83 84 88 87 91 89 90 87 88 91 ...
## $ AvgTemp : num [1:51] 59.4 64.8 70.7 45.4 48.8 51.8 50.7 63.5 59 44.4 ...
## $ Land : num [1:51] 155.8 261.2 53.6 47.1 44.7 ...
## $ Poverty : num [1:51] 12.6 14.2 13.3 13.6 11.9 ...
## $ LifeExp : num [1:51] 81.7 79.2 80.2 81.4 78.4 79.4 77 77.9 78.1 78.1 ...
summary(states) # summary statistics per variable
## State Region Pop Pcpi
## Length:51 Midwest :12 Min. : 0.5768 Min. :38.91
## Class :character Northeast: 9 1st Qu.: 1.8164 1st Qu.:48.45
## Mode :character South :17 Median : 4.5058 Median :53.19
## West :13 Mean : 6.4991 Mean :54.81
## 3rd Qu.: 7.4284 3rd Qu.:59.25
## Max. :39.5382 Max. :83.41
## Literacy Murder HSGrad AvgTemp
## Min. :76.90 Min. : 0.900 Min. :83.0 Min. :26.60
## 1st Qu.:85.80 1st Qu.: 3.550 1st Qu.:87.5 1st Qu.:45.30
## Median :89.30 Median : 5.900 Median :90.0 Median :51.70
## Mean :88.32 Mean : 6.461 Mean :89.6 Mean :51.94
## 3rd Qu.:91.60 3rd Qu.: 7.850 3rd Qu.:92.0 3rd Qu.:58.30
## Max. :94.20 Max. :28.200 Max. :94.0 Max. :70.70
## Land Poverty LifeExp
## Min. : 0.061 Min. : 7.42 Min. :74.80
## 1st Qu.: 33.334 1st Qu.:10.45 1st Qu.:77.95
## Median : 53.625 Median :12.36 Median :79.00
## Mean : 69.253 Mean :12.60 Mean :78.76
## 3rd Qu.: 80.693 3rd Qu.:14.17 3rd Qu.:79.85
## Max. :570.641 Max. :19.58 Max. :82.30
par(mfrow = c(1,1)) # Number of plots in rows and columns;
# Histograms and qqplots of the quantitative variables
hist(states$Pop)

qqnorm(states$Pop)

hist(states$Pcpi)

qqnorm(states$Pcpi)

hist(states$Literacy)

qqnorm(states$Literacy)

hist(states$Murder)

qqnorm(states$Murder)

hist(states$HSGrad)

qqnorm(states$HSGrad)

hist(states$AvgTemp)

qqnorm(states$AvgTemp)

# Shapiro tests of Normality
shapiro.test(states$Pop)
##
## Shapiro-Wilk normality test
##
## data: states$Pop
## W = 0.70992, p-value = 9.956e-09
shapiro.test(states$Pcpi)
##
## Shapiro-Wilk normality test
##
## data: states$Pcpi
## W = 0.93788, p-value = 0.0101
shapiro.test(states$Literacy)
##
## Shapiro-Wilk normality test
##
## data: states$Literacy
## W = 0.93827, p-value = 0.01048
shapiro.test(states$Murder)
##
## Shapiro-Wilk normality test
##
## data: states$Murder
## W = 0.77725, p-value = 2.223e-07
shapiro.test(states$HSGrad)
##
## Shapiro-Wilk normality test
##
## data: states$HSGrad
## W = 0.94967, p-value = 0.03053
shapiro.test(states$AvgTemp)
##
## Shapiro-Wilk normality test
##
## data: states$AvgTemp
## W = 0.97568, p-value = 0.3744
shapiro.test(states$Land)
##
## Shapiro-Wilk normality test
##
## data: states$Land
## W = 0.57291, p-value = 6.024e-11
shapiro.test(states$Poverty)
##
## Shapiro-Wilk normality test
##
## data: states$Poverty
## W = 0.96352, p-value = 0.1178
shapiro.test(states$LifeExp)
##
## Shapiro-Wilk normality test
##
## data: states$LifeExp
## W = 0.95847, p-value = 0.07182
# calculate skewness
library(moments)
skewness(states$Pop)
## [1] 2.574231
skewness(states$Pcpi)
## [1] 0.9565852
skewness(states$Literacy)
## [1] -0.7641423
skewness(states$Murder)
## [1] 2.659864
skewness(states$HSGrad)
## [1] -0.468919
skewness(states$AvgTemp)
## [1] -0.01278445
skewness(states$Land)
## [1] 4.248254
skewness(states$Poverty)
## [1] 0.6145787
skewness(states$LifeExp)
## [1] -0.50749
# State count in each region
counts <- sort(table(states$Region), decreasing = TRUE) # Number of states in each region
percentages <- 100 * counts / length(states$Region)
barplot(percentages, ylab = "Percentage", col = "lightblue")
text(x=seq(0.7, 5, 1.2), 2, paste("n=", counts)) # Add count to each bar
# Lollipop plot of the population in each state
library(ggplot2)

ggplot(states, aes(x = State, y = Pop)) +
geom_point(size = 3, color = "red") +
geom_segment(aes(x = State, xend = State, y = 0, yend = Pop)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) # Rotate axis label

library(ggridges)
ggplot(states, aes(x = LifeExp, y = Region, fill = Region)) +
geom_density_ridges() +
theme_ridges() + # No color on backgroud
theme(legend.position = "none", # No show legend
axis.title.x = element_text(hjust = 0.5), # x axis title in the center
axis.title.y = element_text(hjust = 0.5)) # y axis title in the center
## Picking joint bandwidth of 0.64

# Correlation diagram of the numeric variables
st <- states[, 3:11] # Take numeric variables as goal matrix
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
library(corrplot)
## corrplot 0.92 loaded
corMatrix <- cor(as.matrix(st)) # Calculate correlation matrix
col <- colorRampPalette(c("red", "yellow", "blue")) # 3 colors to represent coefficients -1 to 1.
corrplot.mixed(corMatrix, order = "AOE", lower = "number", lower.col = "black",
number.cex = .8, upper = "ellipse", upper.col = col(10),
diag = "u", tl.pos = "lt", tl.col = "black") # Mix plots of "number" and "ellipse"

# Cluster dendrogram for state numeric variables
plot(hclust(as.dist(1 - cor(as.matrix(st))))) # Hierarchical clustering

# density plot showing the distribution of Life Expectancy by region
ggplot(states, aes(x = LifeExp, fill = Region)) + geom_density(alpha = 0.3)

# Box plot of population density by region
states$Pop.Density <- states$Pop*1000/states$Land
boxplot(states$Pop.Density ~ states$Region, xlab = "Region", ylab = "Population Density")

# ANOVA test of population density by region
Pop_Density_model <- aov(states$Pop.Density ~ states$Region, states)
summary(Pop_Density_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## states$Region 3 6104667 2034889 0.81 0.495
## Residuals 47 118136528 2513543
# Scatterplot for Poverty rate and Life Expectancy by Per Capita Income
ggplot(states, aes(x = Poverty, y = LifeExp)) +
geom_point(aes(size = Pcpi, color = Region)) +
geom_smooth(method = 'lm',formula = y ~ x) # Add regression line

# Regional life expectancy distribution
ggplot(states, aes(x = Region, y = LifeExp, fill = Region)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.1)

# Relationship between Life expectancy, Pcpi (Per Capita Income), Murder and High school graduation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# group HSGrad into HSGrad Type first
states.HSGrad <- states %>% mutate(HSGradType = factor(ifelse(HSGrad < 85, "Below 85%",
ifelse(HSGrad < 90 & HSGrad >= 85, "85% to 89%",
"90% and Above"))))
ggplot(states.HSGrad, aes(x = Pcpi, y = LifeExp)) +
geom_point(aes(shape = HSGradType, color = Region, size = Murder)) +
geom_smooth(method = 'lm', formula = y ~ x)

# Segment diagram for all states
row.names(st) <- states$State
## Warning: Setting row names on a tibble is deprecated.
stars(st, key.loc = c(13, 1.5), draw.segments = T)

# Heat map for whole states data set
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
st.matrix <- as.matrix(st) # Transfer the data frame to matrix
s <- apply(st.matrix, 2, function(y)(y - mean(y)) / sd(y)) # Standardize data
a <- heatmap.2(s,
col = greenred(75), # Color green red
density.info = "none",
trace = "none",
scale = "none",
RowSideColors = rainbow(4)[states$Region],
srtCol = 45, # Column labels at 45 degree
margins = c(5, 8), # Bottom and right margins
lhei = c(5, 15) # Relative heights of the rows
)
legend("topright", levels(states$Region), fill = rainbow(4), cex = 0.8) # Add legend

# Principal components analysis
pca = prcomp(st, scale = T) # scale = T to normalize the data
pca
## Standard deviations (1, .., p=9):
## [1] 1.9731245 1.4269138 1.0777428 0.9806669 0.5806738 0.5312239 0.3947277
## [8] 0.3265542 0.2561853
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5 PC6
## Pop -0.18705996 0.56404801 0.14271039 -0.10615718 0.5391009 -0.52681443
## Pcpi 0.24934170 0.40640236 -0.25362863 0.54389400 0.1190293 0.26068925
## Literacy 0.36836299 -0.41140835 -0.05309948 0.04385402 0.1976381 -0.48314379
## Murder -0.29394486 -0.10896532 -0.16455540 0.75728074 -0.1347623 -0.34383275
## HSGrad 0.46613787 -0.17073853 -0.05689959 0.03665375 -0.1520769 -0.21109791
## AvgTemp -0.40059982 0.12791925 -0.32740830 -0.22865850 -0.5833470 -0.34507771
## Land 0.04316697 0.09869514 0.86778066 0.21852523 -0.3582590 -0.10965568
## Poverty -0.44673072 -0.21515307 0.11283273 0.11937585 0.1824353 0.34277533
## LifeExp 0.31866277 0.48374595 -0.09591646 -0.04336590 -0.3360466 0.09848113
## PC7 PC8 PC9
## Pop 0.13240651 -0.11178633 -0.1305065
## Pcpi -0.42934554 0.06887728 -0.3759111
## Literacy -0.04889506 0.61077117 -0.2058841
## Murder 0.25733519 -0.06935722 0.3061972
## HSGrad 0.29091258 -0.64266295 -0.4285409
## AvgTemp -0.23657185 0.08603238 -0.3754136
## Land -0.16014913 0.07373900 -0.1270442
## Poverty 0.43284798 0.17024334 -0.6000344
## LifeExp 0.61312097 0.38738923 0.0689186
plot(pca) # Plot the amount of variance each principal components captures

summary(pca) # Shows the importance of the components
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9731 1.4269 1.0777 0.9807 0.58067 0.53122 0.39473
## Proportion of Variance 0.4326 0.2262 0.1291 0.1069 0.03746 0.03136 0.01731
## Cumulative Proportion 0.4326 0.6588 0.7879 0.8947 0.93219 0.96355 0.98086
## PC8 PC9
## Standard deviation 0.32655 0.25619
## Proportion of Variance 0.01185 0.00729
## Cumulative Proportion 0.99271 1.00000
percentVar <- round(100 * summary(pca)$importance[2, 1:8], 0) # Compute % variances
percentVar
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 43 23 13 11 4 3 2 1
# Biplot for Principal Component Analysis
library(ggfortify)
row.names(states) <- states$State
## Warning: Setting row names on a tibble is deprecated.
autoplot(prcomp(st, scale = T), data = states,
colour = 'Region', shape = FALSE, label = TRUE, label.size = 3.5,
loadings = TRUE, loadings.colour = 'blue', loadings.label = TRUE,
loadings.label.size = 4, loadings.label.colour = 'blue')

# Multiple Linear Regression Analysis
mlr <- states[, c(2:11)]
mlr <- within(mlr, Region <- relevel(Region, ref = "South")) # Set region South as reference
model <- lm(LifeExp ~ ., data = mlr)
summary(model)
##
## Call:
## lm(formula = LifeExp ~ ., data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44533 -0.48242 0.04609 0.48279 1.31713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.9551637 12.3513676 5.583 1.97e-06 ***
## RegionMidwest 1.1841058 0.4237574 2.794 0.008024 **
## RegionNortheast 0.8399497 0.5283958 1.590 0.119995
## RegionWest 1.6913434 0.3956275 4.275 0.000119 ***
## Pop 0.0296121 0.0267562 1.107 0.275187
## Pcpi 0.0783096 0.0235900 3.320 0.001962 **
## Literacy -0.1062576 0.0666764 -1.594 0.119092
## Murder -0.0555570 0.0486810 -1.141 0.260727
## HSGrad 0.1615505 0.1079281 1.497 0.142486
## AvgTemp 0.0475700 0.0272905 1.743 0.089197 .
## Land -0.0004367 0.0018132 -0.241 0.810939
## Poverty -0.2152140 0.1089782 -1.975 0.055397 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7717 on 39 degrees of freedom
## Multiple R-squared: 0.8533, Adjusted R-squared: 0.8119
## F-statistic: 20.62 on 11 and 39 DF, p-value: 6.197e-13
summary.aov(model) # testing the significance of predictors
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 3 66.19 22.062 37.043 1.69e-11 ***
## Pop 1 13.54 13.537 22.729 2.60e-05 ***
## Pcpi 1 31.16 31.163 52.325 1.03e-08 ***
## Literacy 1 0.11 0.114 0.192 0.66402
## Murder 1 11.60 11.598 19.474 7.82e-05 ***
## HSGrad 1 4.16 4.164 6.991 0.01174 *
## AvgTemp 1 6.03 6.029 10.123 0.00287 **
## Land 1 0.00 0.003 0.006 0.93974
## Poverty 1 2.32 2.323 3.900 0.05540 .
## Residuals 39 23.23 0.596
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2,2))
plot(model) # Post regression model diagnostics

# Prediction using the model
LifeExp_predict = data.frame(Region = "Northeast", Pop = 15, Pcpi = 60, Literacy = 88, Murder = 20, HSGrad = 81, AvgTemp = 30, Land = 25, Poverty = 7)
predict(model, LifeExp_predict)
## 1
## 77.47134
# The Minimal Adequate Model
step(model)
## Start: AIC=-16.11
## LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad +
## AvgTemp + Land + Poverty
##
## Df Sum of Sq RSS AIC
## - Land 1 0.0345 23.262 -18.0358
## - Pop 1 0.7295 23.957 -16.5345
## - Murder 1 0.7757 24.003 -16.4363
## <none> 23.227 -16.1116
## - HSGrad 1 1.3344 24.562 -15.2628
## - Literacy 1 1.5125 24.740 -14.8942
## - AvgTemp 1 1.8096 25.037 -14.2855
## - Poverty 1 2.3227 25.550 -13.2508
## - Pcpi 1 6.5631 29.790 -5.4199
## - Region 3 12.6580 35.885 0.0735
##
## Step: AIC=-18.04
## LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad +
## AvgTemp + Poverty
##
## Df Sum of Sq RSS AIC
## - Pop 1 0.6951 23.957 -18.5343
## - Murder 1 0.8057 24.067 -18.2993
## <none> 23.262 -18.0358
## - HSGrad 1 1.3777 24.639 -17.1013
## - Literacy 1 1.4797 24.741 -16.8907
## - Poverty 1 2.2916 25.553 -15.2440
## - AvgTemp 1 2.9902 26.252 -13.8684
## - Pcpi 1 6.8617 30.123 -6.8526
## - Region 3 12.9000 36.162 -1.5351
##
## Step: AIC=-18.53
## LifeExp ~ Region + Pcpi + Literacy + Murder + HSGrad + AvgTemp +
## Poverty
##
## Df Sum of Sq RSS AIC
## - HSGrad 1 0.8571 24.814 -18.7416
## - Murder 1 0.9052 24.862 -18.6427
## <none> 23.957 -18.5343
## - AvgTemp 1 2.6189 26.576 -15.2433
## - Poverty 1 3.5528 27.510 -13.4819
## - Literacy 1 3.6697 27.626 -13.2656
## - Pcpi 1 7.2262 31.183 -7.0897
## - Region 3 13.7224 37.679 -1.4386
##
## Step: AIC=-18.74
## LifeExp ~ Region + Pcpi + Literacy + Murder + AvgTemp + Poverty
##
## Df Sum of Sq RSS AIC
## - Murder 1 0.7107 25.525 -19.3015
## <none> 24.814 -18.7416
## - AvgTemp 1 2.0489 26.863 -16.6954
## - Literacy 1 2.8338 27.648 -15.2266
## - Poverty 1 6.8088 31.623 -8.3755
## - Pcpi 1 6.8654 31.679 -8.2843
## - Region 3 14.4697 39.284 -1.3121
##
## Step: AIC=-19.3
## LifeExp ~ Region + Pcpi + Literacy + AvgTemp + Poverty
##
## Df Sum of Sq RSS AIC
## <none> 25.525 -19.3015
## - AvgTemp 1 1.6701 27.195 -18.0691
## - Literacy 1 4.5208 30.045 -12.9850
## - Pcpi 1 8.1631 33.688 -7.1494
## - Region 3 16.2769 41.801 -0.1436
## - Poverty 1 20.8232 46.348 9.1217
##
## Call:
## lm(formula = LifeExp ~ Region + Pcpi + Literacy + AvgTemp + Poverty,
## data = mlr)
##
## Coefficients:
## (Intercept) RegionMidwest RegionNortheast RegionWest
## 88.11368 1.46394 1.20908 1.84938
## Pcpi Literacy AvgTemp Poverty
## 0.06084 -0.12240 0.03585 -0.37876
model1 = update(model, .~.-Land)
summary(model1)
##
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Literacy + Murder +
## HSGrad + AvgTemp + Poverty, data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42813 -0.50471 0.02614 0.50928 1.30017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.31672 11.92065 5.731 1.13e-06 ***
## RegionMidwest 1.20103 0.41294 2.908 0.00590 **
## RegionNortheast 0.87775 0.49858 1.760 0.08597 .
## RegionWest 1.68013 0.38822 4.328 9.79e-05 ***
## Pop 0.02829 0.02588 1.093 0.28082
## Pcpi 0.07916 0.02305 3.435 0.00139 **
## Literacy -0.10436 0.06543 -1.595 0.11856
## Murder -0.05645 0.04796 -1.177 0.24614
## HSGrad 0.16363 0.10631 1.539 0.13163
## AvgTemp 0.05117 0.02257 2.268 0.02883 *
## Poverty -0.21302 0.10731 -1.985 0.05402 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7626 on 40 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8164
## F-statistic: 23.23 on 10 and 40 DF, p-value: 1.275e-13
anova(model, model1) # Compare the two models
## Analysis of Variance Table
##
## Model 1: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad +
## AvgTemp + Land + Poverty
## Model 2: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad +
## AvgTemp + Poverty
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 39 23.227
## 2 40 23.262 -1 -0.034546 0.058 0.8109
model2 = update(model1, .~.-Literacy)
summary(model2)
##
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Murder + HSGrad +
## AvgTemp + Poverty, data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5047 -0.5882 0.1189 0.4833 1.2337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.72177 11.13239 5.455 2.57e-06 ***
## RegionMidwest 0.97126 0.39422 2.464 0.018035 *
## RegionNortheast 0.86504 0.50782 1.703 0.096055 .
## RegionWest 1.82085 0.38512 4.728 2.68e-05 ***
## Pop 0.04947 0.02263 2.187 0.034538 *
## Pcpi 0.08989 0.02245 4.003 0.000256 ***
## Murder -0.06905 0.04819 -1.433 0.159469
## HSGrad 0.12324 0.10518 1.172 0.248053
## AvgTemp 0.06526 0.02115 3.086 0.003631 **
## Poverty -0.16219 0.10438 -1.554 0.127922
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7768 on 41 degrees of freedom
## Multiple R-squared: 0.8437, Adjusted R-squared: 0.8094
## F-statistic: 24.6 on 9 and 41 DF, p-value: 8.142e-14
model3 = update(model2, .~.-HSGrad)
summary(model3)
##
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Murder + AvgTemp +
## Poverty, data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3797 -0.6313 0.1668 0.5279 1.2404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.50934 2.20886 33.279 < 2e-16 ***
## RegionMidwest 1.07513 0.38583 2.787 0.007963 **
## RegionNortheast 0.90787 0.50874 1.785 0.081561 .
## RegionWest 1.84955 0.38605 4.791 2.09e-05 ***
## Pop 0.03346 0.01811 1.847 0.071751 .
## Pcpi 0.08654 0.02237 3.869 0.000375 ***
## Murder -0.06139 0.04796 -1.280 0.207527
## AvgTemp 0.05464 0.01920 2.847 0.006806 **
## Poverty -0.24116 0.08006 -3.012 0.004377 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7803 on 42 degrees of freedom
## Multiple R-squared: 0.8385, Adjusted R-squared: 0.8078
## F-statistic: 27.26 on 8 and 42 DF, p-value: 2.877e-14
model4 = update(model3, .~.-Murder)
summary(model4)
##
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + AvgTemp + Poverty,
## data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41656 -0.57294 0.05116 0.58856 1.34932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.06407 1.85864 40.387 < 2e-16 ***
## RegionMidwest 1.13912 0.38541 2.956 0.005048 **
## RegionNortheast 1.25846 0.43191 2.914 0.005647 **
## RegionWest 2.02763 0.36277 5.589 1.44e-06 ***
## Pop 0.04096 0.01726 2.372 0.022213 *
## Pcpi 0.06663 0.01620 4.114 0.000172 ***
## AvgTemp 0.05200 0.01923 2.705 0.009758 **
## Poverty -0.31212 0.05819 -5.364 3.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.786 on 43 degrees of freedom
## Multiple R-squared: 0.8322, Adjusted R-squared: 0.8049
## F-statistic: 30.47 on 7 and 43 DF, p-value: 1.088e-14
anova(model, model4)
## Analysis of Variance Table
##
## Model 1: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad +
## AvgTemp + Land + Poverty
## Model 2: LifeExp ~ Region + Pop + Pcpi + AvgTemp + Poverty
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 39 23.227
## 2 43 26.568 -4 -3.3405 1.4022 0.2512