# Linear-Regression-Modeling-Project.R

# Importing Data
library("readxl")
# Read the "data" sheet of the states workbook.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running the script on another machine.
states_path <- "C:/Users/hp/Documents/datasets/datasets/Global/USA/states.xlsx"
states <- read_excel(path = states_path, sheet = "data")

# Store Region as a factor so it is handled as a categorical variable.
states[["Region"]] <- factor(states[["Region"]])

# Show the structure of the data: one line per column with its type.
str(states)

## tibble [51 x 11] (S3: tbl_df/tbl/data.frame)
##  $ State   : chr [1:51] "California" "Texas" "Florida" "New York" ...
##  $ Region  : Factor w/ 4 levels "Midwest","Northeast",..: 4 3 3 2 2 1 1 3 3 1 ...
##  $ Pop     : num [1:51] 39.5 29.1 21.5 20.2 13 ...
##  $ Pcpi    : num [1:51] 66.6 52.8 52.4 71.7 58 ...
##  $ Literacy: num [1:51] 76.9 81 80.3 77.9 87.4 87.1 90.9 83.3 86.4 91.7 ...
##  $ Murder  : num [1:51] 5.6 6.6 5.9 4.2 7.9 9.1 7 8.8 8 7.6 ...
##  $ HSGrad  : num [1:51] 83 84 88 87 91 89 90 87 88 91 ...
##  $ AvgTemp : num [1:51] 59.4 64.8 70.7 45.4 48.8 51.8 50.7 63.5 59 44.4 ...
##  $ Land    : num [1:51] 155.8 261.2 53.6 47.1 44.7 ...
##  $ Poverty : num [1:51] 12.6 14.2 13.3 13.6 11.9 ...
##  $ LifeExp : num [1:51] 81.7 79.2 80.2 81.4 78.4 79.4 77 77.9 78.1 78.1 ...

# Summary statistics for every column: quartiles, mean and range for the
# numeric columns, and the number of states per level for Region.
summary(states)

##     State                 Region        Pop               Pcpi      
##  Length:51          Midwest  :12   Min.   : 0.5768   Min.   :38.91  
##  Class :character   Northeast: 9   1st Qu.: 1.8164   1st Qu.:48.45  
##  Mode  :character   South    :17   Median : 4.5058   Median :53.19  
##                     West     :13   Mean   : 6.4991   Mean   :54.81  
##                                    3rd Qu.: 7.4284   3rd Qu.:59.25  
##                                    Max.   :39.5382   Max.   :83.41  
##     Literacy         Murder           HSGrad        AvgTemp     
##  Min.   :76.90   Min.   : 0.900   Min.   :83.0   Min.   :26.60  
##  1st Qu.:85.80   1st Qu.: 3.550   1st Qu.:87.5   1st Qu.:45.30  
##  Median :89.30   Median : 5.900   Median :90.0   Median :51.70  
##  Mean   :88.32   Mean   : 6.461   Mean   :89.6   Mean   :51.94  
##  3rd Qu.:91.60   3rd Qu.: 7.850   3rd Qu.:92.0   3rd Qu.:58.30  
##  Max.   :94.20   Max.   :28.200   Max.   :94.0   Max.   :70.70  
##       Land            Poverty         LifeExp     
##  Min.   :  0.061   Min.   : 7.42   Min.   :74.80  
##  1st Qu.: 33.334   1st Qu.:10.45   1st Qu.:77.95  
##  Median : 53.625   Median :12.36   Median :79.00  
##  Mean   : 69.253   Mean   :12.60   Mean   :78.76  
##  3rd Qu.: 80.693   3rd Qu.:14.17   3rd Qu.:79.85  
##  Max.   :570.641   Max.   :19.58   Max.   :82.30

# Draw one plot per page (1 x 1 layout).
par(mfrow = c(1, 1))

# Histogram followed by a normal Q-Q plot for each of the listed numeric
# columns. The histogram title and x label are set explicitly so that they read
# exactly as they would for a direct call such as hist(states$Pop).
for (var_name in c("Pop", "Pcpi", "Literacy", "Murder", "HSGrad", "AvgTemp")) {
  values <- states[[var_name]]
  expr_label <- paste0("states$", var_name)
  hist(values, main = paste("Histogram of", expr_label), xlab = expr_label)
  qqnorm(values)
}

# Shapiro-Wilk tests of normality, one per numeric column.
# The null hypothesis is that the sample comes from a normal distribution, so a
# small p-value is evidence against normality. In the output recorded below,
# only AvgTemp (p = 0.37), Poverty (p = 0.12) and LifeExp (p = 0.07) have
# p-values above 0.05.
shapiro.test(states$Pop)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Pop
## W = 0.70992, p-value = 9.956e-09

shapiro.test(states$Pcpi)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Pcpi
## W = 0.93788, p-value = 0.0101

shapiro.test(states$Literacy)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Literacy
## W = 0.93827, p-value = 0.01048

shapiro.test(states$Murder)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Murder
## W = 0.77725, p-value = 2.223e-07

shapiro.test(states$HSGrad)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$HSGrad
## W = 0.94967, p-value = 0.03053

shapiro.test(states$AvgTemp)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$AvgTemp
## W = 0.97568, p-value = 0.3744

shapiro.test(states$Land)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Land
## W = 0.57291, p-value = 6.024e-11

shapiro.test(states$Poverty)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$Poverty
## W = 0.96352, p-value = 0.1178

shapiro.test(states$LifeExp)

## 
##  Shapiro-Wilk normality test
## 
## data:  states$LifeExp
## W = 0.95847, p-value = 0.07182

# Sample skewness of each numeric column, using skewness() from the moments
# package. Positive values indicate a longer right tail, negative values a
# longer left tail. In the output recorded below, Land (4.25), Murder (2.66)
# and Pop (2.57) are the most strongly right-skewed, while AvgTemp (-0.01) is
# close to symmetric.
library(moments)

skewness(states$Pop)

## [1] 2.574231

skewness(states$Pcpi)

## [1] 0.9565852

skewness(states$Literacy)

## [1] -0.7641423

skewness(states$Murder)

## [1] 2.659864

skewness(states$HSGrad)

## [1] -0.468919

skewness(states$AvgTemp)

## [1] -0.01278445

skewness(states$Land)

## [1] 4.248254

skewness(states$Poverty)

## [1] 0.6145787

skewness(states$LifeExp)

## [1] -0.50749

# State count in each region, drawn as a bar chart of percentages.
counts <- sort(table(states$Region), decreasing = TRUE)  # states per region, largest first
percentages <- 100 * counts / length(states$Region)      # share of all states, in percent
# barplot() returns the x coordinate of each bar's midpoint. Placing the labels
# at those coordinates keeps them centred on the bars however many regions
# there are, instead of relying on hard-coded positions.
bar_mids <- barplot(percentages, ylab = "Percentage", col = "lightblue")
text(x = bar_mids, y = 2, labels = paste("n=", counts))  # add the count to each bar

# Lollipop plot of the population in each state
library(ggplot2)

# One stem per state: a segment rising from zero to the state's population,
# with a red dot at the population value.
ggplot(data = states, mapping = aes(x = State, y = Pop)) +
  geom_point(size = 3, colour = "red") +
  geom_segment(mapping = aes(xend = State, y = 0, yend = Pop)) +  # x is inherited from the plot
  theme(axis.text.x = element_text(vjust = 0.5, angle = 90))      # vertical state names

library(ggridges)
# Ridgeline densities of life expectancy, one ridge per region.
ggplot(data = states, mapping = aes(x = LifeExp, y = Region, fill = Region)) +
  geom_density_ridges() +
  theme_ridges() +                             # theme supplied by ggridges
  theme(
    legend.position = "none",                  # hide the legend
    axis.title.x = element_text(hjust = 0.5),  # centre the x-axis title
    axis.title.y = element_text(hjust = 0.5)   # centre the y-axis title
  )

## Picking joint bandwidth of 0.64

# Correlation diagram of the numeric variables
# Columns 3 to 11 are the nine numeric columns Pop through LifeExp (see the
# str() output above). The selection is positional, so it depends on the column
# order of the workbook. `st` is reused below for the correlation matrix, the
# dendrogram, the stars plot, the heat map and the PCA.
st <- states[, 3:11]
library(ellipse)

## 
## Attaching package: 'ellipse'

## The following object is masked from 'package:graphics':
## 
##     pairs

library(corrplot)

## corrplot 0.92 loaded

# Pairwise correlations between the numeric columns.
corMatrix <- cor(as.matrix(st))
# Palette generator running red -> yellow -> blue, used to colour the ellipses.
col <- colorRampPalette(c("red", "yellow", "blue"))
# Mixed display: coefficients as numbers in the lower triangle, ellipses in the
# upper triangle.
corrplot.mixed(
  corMatrix,
  order = "AOE",
  lower = "number", lower.col = "black", number.cex = 0.8,
  upper = "ellipse", upper.col = col(10),
  diag = "u",
  tl.pos = "lt", tl.col = "black"
)

# Cluster dendrogram for state numeric variables
# Hierarchical clustering of the variables, using 1 - correlation as the distance.
plot(hclust(as.dist(1 - cor(as.matrix(st)))))

# Overlaid, semi-transparent density curves of life expectancy, one per region.
ggplot(data = states, mapping = aes(x = LifeExp, fill = Region)) +
  geom_density(alpha = 0.3)

# Box plot of population density by region
# Population density is derived as Pop * 1000 / Land and stored as a new column.
states[["Pop.Density"]] <- 1000 * states[["Pop"]] / states[["Land"]]
boxplot(Pop.Density ~ Region, data = states,
        xlab = "Region", ylab = "Population Density")

# ANOVA test of population density by region
# One-way ANOVA: does mean population density differ between the regions?
# In the output recorded below the F-test gives p = 0.495, i.e. no evidence of
# a difference.
# NOTE(review): the formula refers to the columns through `states$`, so the
# second argument (matched to aov()'s `data` parameter) is effectively unused
# and the term is labelled "states$Region" in the summary. The usual spelling
# is aov(Pop.Density ~ Region, data = states).
Pop_Density_model <- aov(states$Pop.Density ~ states$Region, states)
summary(Pop_Density_model)

##               Df    Sum Sq Mean Sq F value Pr(>F)
## states$Region  3   6104667 2034889    0.81  0.495
## Residuals     47 118136528 2513543

# Life expectancy against poverty rate. Point size encodes per-capita income
# and point colour encodes the region.
ggplot(data = states, mapping = aes(x = Poverty, y = LifeExp)) +
  geom_point(mapping = aes(size = Pcpi, colour = Region)) +
  geom_smooth(formula = y ~ x, method = "lm")  # fitted straight line

# Regional life expectancy distribution
# Untrimmed violins with a narrow box plot drawn on top of each.
ggplot(data = states, mapping = aes(x = Region, y = LifeExp, fill = Region)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1)

# Relationship between Life expectancy, Pcpi (Per Capita Income), Murder and High school graduation
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Band the high-school graduation rate into three groups: below 85, from 85 up
# to (but not including) 90, and 90 or more. The inner test only matters for
# values that already failed `HSGrad < 85`, so `HSGrad < 90` is sufficient.
states.HSGrad <- mutate(
  states,
  HSGradType = factor(
    ifelse(HSGrad < 85, "Below 85%",
           ifelse(HSGrad < 90, "85% to 89%", "90% and Above"))
  )
)
# Life expectancy against per-capita income. Point shape shows the graduation
# band, colour the region and size the murder rate.
ggplot(data = states.HSGrad, mapping = aes(x = Pcpi, y = LifeExp)) +
  geom_point(mapping = aes(shape = HSGradType, colour = Region, size = Murder)) +
  geom_smooth(formula = y ~ x, method = "lm")

# Segment diagram for all states
# The state names are attached as row names so that they are available as
# labels. Setting row names on a tibble is deprecated, so `st` is converted to
# a plain data frame first.
st <- as.data.frame(st)
rownames(st) <- states$State

stars(st, key.loc = c(13, 1.5), draw.segments = TRUE)

# Heat map for whole states data set
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# Standardise every column (subtract its mean, divide by its standard
# deviation) so that the heat map compares the variables on a common scale.
standardize <- function(v) {
  (v - mean(v)) / sd(v)
}
st.matrix <- as.matrix(st)
s <- apply(st.matrix, MARGIN = 2, FUN = standardize)

region_palette <- rainbow(4)  # one colour per Region level
a <- heatmap.2(
  s,
  col = greenred(75),                             # green-to-red colour ramp
  density.info = "none",
  trace = "none",
  scale = "none",                                 # the data are already standardised
  RowSideColors = region_palette[states$Region],  # side strip coloured by region
  srtCol = 45,                                    # column labels at 45 degrees
  margins = c(5, 8),                              # bottom and right margins
  lhei = c(5, 15)                                 # relative heights of the layout rows
)
# Key for the region colours in the side strip.
legend("topright", legend = levels(states$Region), fill = region_palette, cex = 0.8)

# Principal components analysis
# scale. = TRUE rescales every column to unit variance before the
# decomposition. The argument is written out in full (`scale.`) instead of
# relying on partial matching of `scale`, and TRUE is used instead of the
# reassignable shorthand T.
pca <- prcomp(st, scale. = TRUE)
pca

## Standard deviations (1, .., p=9):
## [1] 1.9731245 1.4269138 1.0777428 0.9806669 0.5806738 0.5312239 0.3947277
## [8] 0.3265542 0.2561853
## 
## Rotation (n x k) = (9 x 9):
##                  PC1         PC2         PC3         PC4        PC5         PC6
## Pop      -0.18705996  0.56404801  0.14271039 -0.10615718  0.5391009 -0.52681443
## Pcpi      0.24934170  0.40640236 -0.25362863  0.54389400  0.1190293  0.26068925
## Literacy  0.36836299 -0.41140835 -0.05309948  0.04385402  0.1976381 -0.48314379
## Murder   -0.29394486 -0.10896532 -0.16455540  0.75728074 -0.1347623 -0.34383275
## HSGrad    0.46613787 -0.17073853 -0.05689959  0.03665375 -0.1520769 -0.21109791
## AvgTemp  -0.40059982  0.12791925 -0.32740830 -0.22865850 -0.5833470 -0.34507771
## Land      0.04316697  0.09869514  0.86778066  0.21852523 -0.3582590 -0.10965568
## Poverty  -0.44673072 -0.21515307  0.11283273  0.11937585  0.1824353  0.34277533
## LifeExp   0.31866277  0.48374595 -0.09591646 -0.04336590 -0.3360466  0.09848113
##                  PC7         PC8        PC9
## Pop       0.13240651 -0.11178633 -0.1305065
## Pcpi     -0.42934554  0.06887728 -0.3759111
## Literacy -0.04889506  0.61077117 -0.2058841
## Murder    0.25733519 -0.06935722  0.3061972
## HSGrad    0.29091258 -0.64266295 -0.4285409
## AvgTemp  -0.23657185  0.08603238 -0.3754136
## Land     -0.16014913  0.07373900 -0.1270442
## Poverty   0.43284798  0.17024334 -0.6000344
## LifeExp   0.61312097  0.38738923  0.0689186

# Plot the variance captured by each principal component.
plot(pca)

# Importance table: standard deviation, proportion of variance and cumulative
# proportion for each component. In the output recorded below, the first four
# components together account for about 89% of the variance.
summary(pca)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.9731 1.4269 1.0777 0.9807 0.58067 0.53122 0.39473
## Proportion of Variance 0.4326 0.2262 0.1291 0.1069 0.03746 0.03136 0.01731
## Cumulative Proportion  0.4326 0.6588 0.7879 0.8947 0.93219 0.96355 0.98086
##                            PC8     PC9
## Standard deviation     0.32655 0.25619
## Proportion of Variance 0.01185 0.00729
## Cumulative Proportion  0.99271 1.00000

# Percentage of the total variance explained by each of the first eight
# components, rounded to whole numbers.
pca_importance <- summary(pca)$importance
percentVar <- round(pca_importance["Proportion of Variance", 1:8] * 100)
percentVar

## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 
##  43  23  13  11   4   3   2   1

# Biplot for Principal Component Analysis
library(ggfortify)
# The state names are attached as row names of the data passed to autoplot()
# so that label = TRUE can use them (presumably the labels are taken from the
# row names of `data` - confirm against the ggfortify documentation).
# Setting row names on a tibble is deprecated, so they are attached to a plain
# data-frame copy and `states` itself is left untouched.
states_labelled <- as.data.frame(states)
rownames(states_labelled) <- states$State

autoplot(prcomp(st, scale. = TRUE), data = states_labelled,
         colour = "Region", shape = FALSE, label = TRUE, label.size = 3.5,
         loadings = TRUE, loadings.colour = "blue", loadings.label = TRUE,
         loadings.label.size = 4, loadings.label.colour = "blue")

# Multiple Linear Regression Analysis
# Modelling data: columns 2 to 11, i.e. Region and the numeric columns up to
# LifeExp.
mlr <- states[, 2:11]
# Make South the reference level, so every Region coefficient is a contrast
# with South.
mlr$Region <- relevel(mlr$Region, ref = "South")
# Regress life expectancy on all other columns of mlr.
model <- lm(LifeExp ~ ., data = mlr)
summary(model)

## 
## Call:
## lm(formula = LifeExp ~ ., data = mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.44533 -0.48242  0.04609  0.48279  1.31713 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     68.9551637 12.3513676   5.583 1.97e-06 ***
## RegionMidwest    1.1841058  0.4237574   2.794 0.008024 ** 
## RegionNortheast  0.8399497  0.5283958   1.590 0.119995    
## RegionWest       1.6913434  0.3956275   4.275 0.000119 ***
## Pop              0.0296121  0.0267562   1.107 0.275187    
## Pcpi             0.0783096  0.0235900   3.320 0.001962 ** 
## Literacy        -0.1062576  0.0666764  -1.594 0.119092    
## Murder          -0.0555570  0.0486810  -1.141 0.260727    
## HSGrad           0.1615505  0.1079281   1.497 0.142486    
## AvgTemp          0.0475700  0.0272905   1.743 0.089197 .  
## Land            -0.0004367  0.0018132  -0.241 0.810939    
## Poverty         -0.2152140  0.1089782  -1.975 0.055397 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7717 on 39 degrees of freedom
## Multiple R-squared:  0.8533, Adjusted R-squared:  0.8119 
## F-statistic: 20.62 on 11 and 39 DF,  p-value: 6.197e-13

# ANOVA table for the fitted model, one row per term.
# NOTE(review): for an lm fit these are presumably sequential sums of squares,
# so each row depends on the order of the terms. That would explain why Pop is
# significant here but not in the coefficient table above. Confirm before
# interpreting individual rows.
summary.aov(model)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Region       3  66.19  22.062  37.043 1.69e-11 ***
## Pop          1  13.54  13.537  22.729 2.60e-05 ***
## Pcpi         1  31.16  31.163  52.325 1.03e-08 ***
## Literacy     1   0.11   0.114   0.192  0.66402    
## Murder       1  11.60  11.598  19.474 7.82e-05 ***
## HSGrad       1   4.16   4.164   6.991  0.01174 *  
## AvgTemp      1   6.03   6.029  10.123  0.00287 ** 
## Land         1   0.00   0.003   0.006  0.93974    
## Poverty      1   2.32   2.323   3.900  0.05540 .  
## Residuals   39  23.23   0.596                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-regression diagnostics: the lm diagnostic plots arranged on a 2 x 2 page.
# par() returns the previous settings; they are restored afterwards so that
# any later plot is not squeezed into the same 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

# Prediction using the model
# A single hypothetical state, with one value for every predictor in the model.
LifeExp_predict <- data.frame(
  Region = "Northeast",
  Pop = 15,
  Pcpi = 60,
  Literacy = 88,
  Murder = 20,
  HSGrad = 81,
  AvgTemp = 30,
  Land = 25,
  Poverty = 7
)
predict(model, newdata = LifeExp_predict)

##        1 
## 77.47134

# The Minimal Adequate Model
# Stepwise selection by AIC, starting from the full model. In the trace
# recorded below, terms are removed one at a time and the search stops at
# LifeExp ~ Region + Pcpi + Literacy + AvgTemp + Poverty (AIC = -19.3).
# NOTE(review): the selected model is only printed, not stored, and the manual
# eliminations further down end at a different model (Pop kept, Literacy
# dropped).
step(model)

## Start:  AIC=-16.11
## LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad + 
##     AvgTemp + Land + Poverty
## 
##            Df Sum of Sq    RSS      AIC
## - Land      1    0.0345 23.262 -18.0358
## - Pop       1    0.7295 23.957 -16.5345
## - Murder    1    0.7757 24.003 -16.4363
## <none>                  23.227 -16.1116
## - HSGrad    1    1.3344 24.562 -15.2628
## - Literacy  1    1.5125 24.740 -14.8942
## - AvgTemp   1    1.8096 25.037 -14.2855
## - Poverty   1    2.3227 25.550 -13.2508
## - Pcpi      1    6.5631 29.790  -5.4199
## - Region    3   12.6580 35.885   0.0735
## 
## Step:  AIC=-18.04
## LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad + 
##     AvgTemp + Poverty
## 
##            Df Sum of Sq    RSS      AIC
## - Pop       1    0.6951 23.957 -18.5343
## - Murder    1    0.8057 24.067 -18.2993
## <none>                  23.262 -18.0358
## - HSGrad    1    1.3777 24.639 -17.1013
## - Literacy  1    1.4797 24.741 -16.8907
## - Poverty   1    2.2916 25.553 -15.2440
## - AvgTemp   1    2.9902 26.252 -13.8684
## - Pcpi      1    6.8617 30.123  -6.8526
## - Region    3   12.9000 36.162  -1.5351
## 
## Step:  AIC=-18.53
## LifeExp ~ Region + Pcpi + Literacy + Murder + HSGrad + AvgTemp + 
##     Poverty
## 
##            Df Sum of Sq    RSS      AIC
## - HSGrad    1    0.8571 24.814 -18.7416
## - Murder    1    0.9052 24.862 -18.6427
## <none>                  23.957 -18.5343
## - AvgTemp   1    2.6189 26.576 -15.2433
## - Poverty   1    3.5528 27.510 -13.4819
## - Literacy  1    3.6697 27.626 -13.2656
## - Pcpi      1    7.2262 31.183  -7.0897
## - Region    3   13.7224 37.679  -1.4386
## 
## Step:  AIC=-18.74
## LifeExp ~ Region + Pcpi + Literacy + Murder + AvgTemp + Poverty
## 
##            Df Sum of Sq    RSS      AIC
## - Murder    1    0.7107 25.525 -19.3015
## <none>                  24.814 -18.7416
## - AvgTemp   1    2.0489 26.863 -16.6954
## - Literacy  1    2.8338 27.648 -15.2266
## - Poverty   1    6.8088 31.623  -8.3755
## - Pcpi      1    6.8654 31.679  -8.2843
## - Region    3   14.4697 39.284  -1.3121
## 
## Step:  AIC=-19.3
## LifeExp ~ Region + Pcpi + Literacy + AvgTemp + Poverty
## 
##            Df Sum of Sq    RSS      AIC
## <none>                  25.525 -19.3015
## - AvgTemp   1    1.6701 27.195 -18.0691
## - Literacy  1    4.5208 30.045 -12.9850
## - Pcpi      1    8.1631 33.688  -7.1494
## - Region    3   16.2769 41.801  -0.1436
## - Poverty   1   20.8232 46.348   9.1217

## 
## Call:
## lm(formula = LifeExp ~ Region + Pcpi + Literacy + AvgTemp + Poverty, 
##     data = mlr)
## 
## Coefficients:
##     (Intercept)    RegionMidwest  RegionNortheast       RegionWest  
##        88.11368          1.46394          1.20908          1.84938  
##            Pcpi         Literacy          AvgTemp          Poverty  
##         0.06084         -0.12240          0.03585         -0.37876

# Refit the full model without Land.
model1 <- update(model, . ~ . - Land)
summary(model1)

## 
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + 
##     HSGrad + AvgTemp + Poverty, data = mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.42813 -0.50471  0.02614  0.50928  1.30017 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     68.31672   11.92065   5.731 1.13e-06 ***
## RegionMidwest    1.20103    0.41294   2.908  0.00590 ** 
## RegionNortheast  0.87775    0.49858   1.760  0.08597 .  
## RegionWest       1.68013    0.38822   4.328 9.79e-05 ***
## Pop              0.02829    0.02588   1.093  0.28082    
## Pcpi             0.07916    0.02305   3.435  0.00139 ** 
## Literacy        -0.10436    0.06543  -1.595  0.11856    
## Murder          -0.05645    0.04796  -1.177  0.24614    
## HSGrad           0.16363    0.10631   1.539  0.13163    
## AvgTemp          0.05117    0.02257   2.268  0.02883 *  
## Poverty         -0.21302    0.10731  -1.985  0.05402 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7626 on 40 degrees of freedom
## Multiple R-squared:  0.8531, Adjusted R-squared:  0.8164 
## F-statistic: 23.23 on 10 and 40 DF,  p-value: 1.275e-13

# F-test comparing the full model with the model without Land. In the output
# recorded below p = 0.81, so dropping Land does not significantly worsen the
# fit.
anova(model, model1)

## Analysis of Variance Table
## 
## Model 1: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad + 
##     AvgTemp + Land + Poverty
## Model 2: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad + 
##     AvgTemp + Poverty
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     39 23.227                          
## 2     40 23.262 -1 -0.034546 0.058 0.8109

# Refit model1 without Literacy.
model2 <- update(model1, . ~ . - Literacy)
summary(model2)

## 
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Murder + HSGrad + 
##     AvgTemp + Poverty, data = mlr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5047 -0.5882  0.1189  0.4833  1.2337 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     60.72177   11.13239   5.455 2.57e-06 ***
## RegionMidwest    0.97126    0.39422   2.464 0.018035 *  
## RegionNortheast  0.86504    0.50782   1.703 0.096055 .  
## RegionWest       1.82085    0.38512   4.728 2.68e-05 ***
## Pop              0.04947    0.02263   2.187 0.034538 *  
## Pcpi             0.08989    0.02245   4.003 0.000256 ***
## Murder          -0.06905    0.04819  -1.433 0.159469    
## HSGrad           0.12324    0.10518   1.172 0.248053    
## AvgTemp          0.06526    0.02115   3.086 0.003631 ** 
## Poverty         -0.16219    0.10438  -1.554 0.127922    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7768 on 41 degrees of freedom
## Multiple R-squared:  0.8437, Adjusted R-squared:  0.8094 
## F-statistic:  24.6 on 9 and 41 DF,  p-value: 8.142e-14

# Refit model2 without HSGrad.
model3 <- update(model2, . ~ . - HSGrad)
summary(model3)

## 
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + Murder + AvgTemp + 
##     Poverty, data = mlr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3797 -0.6313  0.1668  0.5279  1.2404 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     73.50934    2.20886  33.279  < 2e-16 ***
## RegionMidwest    1.07513    0.38583   2.787 0.007963 ** 
## RegionNortheast  0.90787    0.50874   1.785 0.081561 .  
## RegionWest       1.84955    0.38605   4.791 2.09e-05 ***
## Pop              0.03346    0.01811   1.847 0.071751 .  
## Pcpi             0.08654    0.02237   3.869 0.000375 ***
## Murder          -0.06139    0.04796  -1.280 0.207527    
## AvgTemp          0.05464    0.01920   2.847 0.006806 ** 
## Poverty         -0.24116    0.08006  -3.012 0.004377 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7803 on 42 degrees of freedom
## Multiple R-squared:  0.8385, Adjusted R-squared:  0.8078 
## F-statistic: 27.26 on 8 and 42 DF,  p-value: 2.877e-14

# Refit model3 without Murder.
model4 <- update(model3, . ~ . - Murder)
summary(model4)

## 
## Call:
## lm(formula = LifeExp ~ Region + Pop + Pcpi + AvgTemp + Poverty, 
##     data = mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41656 -0.57294  0.05116  0.58856  1.34932 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     75.06407    1.85864  40.387  < 2e-16 ***
## RegionMidwest    1.13912    0.38541   2.956 0.005048 ** 
## RegionNortheast  1.25846    0.43191   2.914 0.005647 ** 
## RegionWest       2.02763    0.36277   5.589 1.44e-06 ***
## Pop              0.04096    0.01726   2.372 0.022213 *  
## Pcpi             0.06663    0.01620   4.114 0.000172 ***
## AvgTemp          0.05200    0.01923   2.705 0.009758 ** 
## Poverty         -0.31212    0.05819  -5.364 3.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.786 on 43 degrees of freedom
## Multiple R-squared:  0.8322, Adjusted R-squared:  0.8049 
## F-statistic: 30.47 on 7 and 43 DF,  p-value: 1.088e-14

# F-test comparing the full model with the reduced model4 (Land, Literacy,
# HSGrad and Murder removed). In the output recorded below p = 0.25, so the
# four dropped terms together do not significantly improve the fit.
anova(model, model4)

## Analysis of Variance Table
## 
## Model 1: LifeExp ~ Region + Pop + Pcpi + Literacy + Murder + HSGrad + 
##     AvgTemp + Land + Poverty
## Model 2: LifeExp ~ Region + Pop + Pcpi + AvgTemp + Poverty
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     39 23.227                           
## 2     43 26.568 -4   -3.3405 1.4022 0.2512

# Linear-Regression-Modeling-Project.R
#
# hp
#
# 2022-05-11