Using R, build a multiple regression model for data that interests you. Include in this model at least one quadratic term, one dichotomous term, and one dichotomous vs. quantitative interaction term. Interpret all coefficients. Conduct residual analysis. Was the linear model appropriate? Why or why not?
# Read the Egypt CSV export and keep columns 1-5, 9-17 and 21-24 (18 columns).
data <- read.csv("/Users/Michele/Desktop/Egypt - Sheet2.csv")[
  c(1:5, 9:17, 21:24)
]
# Replace the headers of the retained columns with snake_case names,
# in column order.
names(data) <- c(
  "year",
  "gdp",
  "total_fish_production",
  "capture_fish",
  "aquaculture",
  "population_largest_city",
  "percent_65_survival_male",
  "percent_65_survival_female",
  "birth_rate",
  "life_expectancy_male",
  "life_expectancy",
  "life_expectancy_female",
  "infant_mortality",
  "death_rate",
  "household_consumption",
  "consumption_per_capita",
  "imports_percent",
  "imports_ten_percent"
)
# Print the new names as a sanity check.
names(data)
## [1] "year" "gdp"
## [3] "total_fish_production" "capture_fish"
## [5] "aquaculture" "population_largest_city"
## [7] "percent_65_survival_male" "percent_65_survival_female"
## [9] "birth_rate" "life_expectancy_male"
## [11] "life_expectancy" "life_expectancy_female"
## [13] "infant_mortality" "death_rate"
## [15] "household_consumption" "consumption_per_capita"
## [17] "imports_percent" "imports_ten_percent"
library(reshape2)
library(ggplot2)
# Stack every column of `data` into long (variable, value) form so that each
# variable can be drawn in its own panel.
meltData <- melt(data)
## No id variables; using all as measure variables
# One boxplot per variable, each panel with its own axes. The argument is
# spelled `scales` in full rather than relying on partial matching of `scale`.
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scales = "free")
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
# Density curve for every column of `data`, one free-scaled panel per column.
# pivot_longer() replaces the superseded gather(); the long table still has
# the `key` and `value` columns that the plotting layers refer to.
density <- data %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_density()
# Printing the ggplot object renders it.
density
# Full model: gdp regressed on every column of `data` except year and
# aquaculture.
summary(lm(
  gdp ~ total_fish_production + capture_fish + population_largest_city +
    percent_65_survival_male + percent_65_survival_female + birth_rate +
    life_expectancy_male + life_expectancy + life_expectancy_female +
    infant_mortality + death_rate + household_consumption +
    consumption_per_capita + imports_percent + imports_ten_percent,
  data = data
))
##
## Call:
## lm(formula = gdp ~ total_fish_production + capture_fish + population_largest_city +
## percent_65_survival_male + percent_65_survival_female + birth_rate +
## life_expectancy_male + life_expectancy + life_expectancy_female +
## infant_mortality + death_rate + household_consumption + consumption_per_capita +
## imports_percent + imports_ten_percent, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.138 -17.447 -2.252 16.945 65.948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.481e+04 8.131e+03 1.821 0.07607 .
## total_fish_production 3.455e-04 1.697e-04 2.036 0.04843 *
## capture_fish -3.347e-04 3.966e-04 -0.844 0.40371
## population_largest_city -1.586e-04 1.248e-04 -1.271 0.21098
## percent_65_survival_male 1.681e+02 1.715e+02 0.980 0.33278
## percent_65_survival_female -2.246e+02 1.505e+02 -1.493 0.14339
## birth_rate -1.125e+02 1.738e+02 -0.647 0.52111
## life_expectancy_male 1.800e+04 5.540e+03 3.248 0.00235 **
## life_expectancy -3.365e+04 1.025e+04 -3.285 0.00213 **
## life_expectancy_female 1.568e+04 4.682e+03 3.349 0.00178 **
## infant_mortality -1.448e+01 6.793e+00 -2.132 0.03918 *
## death_rate -1.776e+02 1.434e+02 -1.238 0.22282
## household_consumption -6.989e-10 7.136e-09 -0.098 0.92247
## consumption_per_capita 2.781e-01 4.290e-01 0.648 0.52050
## imports_percent -7.559e-01 1.825e+00 -0.414 0.68088
## imports_ten_percent -2.234e+01 2.253e+01 -0.991 0.32748
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33 on 40 degrees of freedom
## Multiple R-squared: 0.9981, Adjusted R-squared: 0.9974
## F-statistic: 1396 on 15 and 40 DF, p-value: < 2.2e-16
# Reduced model: the full model without capture_fish, birth_rate,
# household_consumption, consumption_per_capita, imports_percent and
# imports_ten_percent.
summary(lm(
  gdp ~ total_fish_production + population_largest_city +
    percent_65_survival_male + percent_65_survival_female +
    life_expectancy_male + life_expectancy + life_expectancy_female +
    infant_mortality + death_rate,
  data = data
))
##
## Call:
## lm(formula = gdp ~ total_fish_production + population_largest_city +
## percent_65_survival_male + percent_65_survival_female + life_expectancy_male +
## life_expectancy + life_expectancy_female + infant_mortality +
## death_rate, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.996 -18.009 -2.603 16.884 75.031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.770e+04 5.258e+03 3.366 0.00155 **
## total_fish_production 3.208e-04 1.256e-04 2.555 0.01400 *
## population_largest_city -1.370e-04 2.310e-05 -5.930 3.69e-07 ***
## percent_65_survival_male 2.480e+02 1.187e+02 2.090 0.04219 *
## percent_65_survival_female -3.295e+02 1.055e+02 -3.123 0.00309 **
## life_expectancy_male 2.093e+04 3.640e+03 5.751 6.84e-07 ***
## life_expectancy -3.934e+04 6.533e+03 -6.022 2.69e-07 ***
## life_expectancy_female 1.843e+04 2.894e+03 6.371 8.04e-08 ***
## infant_mortality -9.506e+00 3.200e+00 -2.971 0.00471 **
## death_rate -3.238e+02 1.081e+02 -2.997 0.00439 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.25 on 46 degrees of freedom
## Multiple R-squared: 0.9979, Adjusted R-squared: 0.9975
## F-statistic: 2436 on 9 and 46 DF, p-value: < 2.2e-16
The residuals are roughly symmetric about zero and the fit statistics are strong (adjusted R-squared of about 0.9975), so the multiple linear regression model appears to be valid.
# Regress gdp on aquaculture and capture_fish, then print the fit summary.
fish <- lm(
  formula = gdp ~ aquaculture + capture_fish,
  data = data
)
summary(fish)
##
## Call:
## lm(formula = gdp ~ aquaculture + capture_fish, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -303.79 -115.32 3.61 123.90 341.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.504e+02 4.658e+01 13.96 < 2e-16 ***
## aquaculture 8.958e-04 8.533e-05 10.50 1.5e-14 ***
## capture_fish 2.797e-03 2.259e-04 12.38 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162.9 on 53 degrees of freedom
## Multiple R-squared: 0.9384, Adjusted R-squared: 0.9361
## F-statistic: 404 on 2 and 53 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for the `fish` lm fit
# (residual analysis).
plot(fish)
Again, the residuals look good and the fit statistics are strong, so the multiple linear regression model appears to be valid.
# Extend the fish model with a quadratic term in capture_fish; I() makes `^`
# mean arithmetic squaring inside the formula.
quadratic_fish <- lm(
  formula = gdp ~ aquaculture + capture_fish + I(capture_fish^2),
  data = data
)
summary(quadratic_fish)
##
## Call:
## lm(formula = gdp ~ aquaculture + capture_fish + I(capture_fish^2),
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -316.79 -66.20 -8.51 75.36 274.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.537e+02 8.199e+01 3.094 0.00318 **
## aquaculture 9.199e-04 6.890e-05 13.351 < 2e-16 ***
## capture_fish 7.551e-03 8.923e-04 8.463 2.38e-11 ***
## I(capture_fish^2) -1.003e-08 1.844e-09 -5.442 1.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 131.3 on 52 degrees of freedom
## Multiple R-squared: 0.9608, Adjusted R-squared: 0.9585
## F-statistic: 424.7 on 3 and 52 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for the `quadratic_fish` lm fit
# (residual analysis).
plot(quadratic_fish)
Let’s look for different relationships using our dichotomous variable.
# Fish model with imports_ten_percent added as a further predictor.
summary(lm(
  gdp ~ aquaculture + capture_fish + imports_ten_percent,
  data = data
))
##
## Call:
## lm(formula = gdp ~ aquaculture + capture_fish + imports_ten_percent,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.59 -119.59 11.98 121.77 337.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.046e+02 9.916e+01 6.097 1.35e-07 ***
## aquaculture 9.024e-04 8.683e-05 10.393 2.70e-14 ***
## capture_fish 2.884e-03 2.819e-04 10.233 4.67e-14 ***
## imports_ten_percent 3.676e+01 7.005e+01 0.525 0.602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 164 on 52 degrees of freedom
## Multiple R-squared: 0.9388, Adjusted R-squared: 0.9352
## F-statistic: 265.8 on 3 and 52 DF, p-value: < 2.2e-16
Again, the residuals look good and the fit statistics are strong, so the multiple linear regression model appears to be valid. Isn’t it funny: with higher life expectancies for women, GDP decreases, while with higher life expectancies for men, GDP increases…
# Regress gdp on male and female life expectancy. The model object shares its
# name with the `life_expectancy` column of `data`; the formula does not use
# that column.
life_expectancy <- lm(
  formula = gdp ~ life_expectancy_male + life_expectancy_female,
  data = data
)
summary(life_expectancy)
##
## Call:
## lm(formula = gdp ~ life_expectancy_male + life_expectancy_female,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -159.237 -95.731 -2.555 92.284 185.163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3643.05 113.53 -32.09 < 2e-16 ***
## life_expectancy_male 530.95 42.44 12.51 < 2e-16 ***
## life_expectancy_female -415.06 39.33 -10.55 1.24e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 53 degrees of freedom
## Multiple R-squared: 0.9753, Adjusted R-squared: 0.9744
## F-statistic: 1048 on 2 and 53 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for the `life_expectancy` lm fit
# (residual analysis).
plot(life_expectancy)
# Life-expectancy model with imports_ten_percent added as a further predictor.
dic_life_expectancy <- lm(
  formula = gdp ~ life_expectancy_male + life_expectancy_female +
    imports_ten_percent,
  data = data
)
summary(dic_life_expectancy)
##
## Call:
## lm(formula = gdp ~ life_expectancy_male + life_expectancy_female +
## imports_ten_percent, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150.354 -87.384 0.892 89.311 193.965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3883.13 182.14 -21.319 < 2e-16 ***
## life_expectancy_male 558.23 44.83 12.451 < 2e-16 ***
## life_expectancy_female -437.49 40.96 -10.682 1.02e-14 ***
## imports_ten_percent 69.81 41.84 1.668 0.101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101.4 on 52 degrees of freedom
## Multiple R-squared: 0.9766, Adjusted R-squared: 0.9752
## F-statistic: 723.2 on 3 and 52 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for the `dic_life_expectancy` lm fit
# (residual analysis).
plot(dic_life_expectancy)
# Add quadratic terms for both life-expectancy predictors alongside
# imports_ten_percent; I() makes `^` mean arithmetic squaring inside the
# formula.
dic_quad_life_expectancy <- lm(
  formula = gdp ~ life_expectancy_male + I(life_expectancy_male^2) +
    life_expectancy_female + I(life_expectancy_female^2) +
    imports_ten_percent,
  data = data
)
summary(dic_quad_life_expectancy)
##
## Call:
## lm(formula = gdp ~ life_expectancy_male + I(life_expectancy_male^2) +
## life_expectancy_female + I(life_expectancy_female^2) + imports_ten_percent,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -108.932 -37.594 -4.559 32.258 101.210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19670.080 3214.200 -6.120 1.43e-07 ***
## life_expectancy_male -185.623 309.750 -0.599 0.551700
## I(life_expectancy_male^2) 11.099 1.995 5.563 1.04e-06 ***
## life_expectancy_female 776.730 215.001 3.613 0.000702 ***
## I(life_expectancy_female^2) -13.871 1.406 -9.865 2.54e-13 ***
## imports_ten_percent 29.088 22.217 1.309 0.196435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.66 on 50 degrees of freedom
## Multiple R-squared: 0.9939, Adjusted R-squared: 0.9933
## F-statistic: 1638 on 5 and 50 DF, p-value: < 2.2e-16
# Draw the regression diagnostic plots for the `dic_quad_life_expectancy` lm
# fit (residual analysis).
plot(dic_quad_life_expectancy)