library(distill)
library(dplyr)
library(readr)
library(tidyverse)
library(knitr)
library("readxl")
library(alr4)
library(ggplot2)
library(smss)
library(broom)
library(qpcR)

Question 1

(SMSS 14.3, 14.4, merged & modified)

(Data file: house.selling.price.2 from smss R package)

data("house.selling.price.2") 
dim(house.selling.price.2)

[1] 93  5

For the house.selling.price.2 data the tables below show a correlation matrix and a model fit using four predictors of selling price.

With these four predictors,

A. For backward elimination, which variable would be deleted first? Why?

Predictor Beds has the highest p-value and should be deleted first using backward elimination.

B. For forward selection, which variable would be added first? Why?

Predictor Size has a p-value = 0 and should be added first using forward selection since it is known that Size affects price based on high correlation coefficient of 0.899.

C.Why do you think that BEDS has such a large P-value in the multiple regression model, even though it has a substantial correlation with PRICE?

The large p-value for predictor BEDS proves that we failed to reject the null hypothesis and it is NOT statistically significant. The correlation coefficient of 0.590 indicates a moderate positive correlation between BEDS and PRICE.

Large p-value and substantial correlation is possible because of low sample size of 93 observations.

D. Using software with these four predictors, find the model that would be selected using each criterion:R2 ,Adjusted R2, PRESS, AIC,BIC

HW4Q1.lm <-lm(P ~ S +Ba +New, data = house.selling.price.2)
summary(HW4Q1.lm)


Call:
lm(formula = P ~ S + Ba + New, data = house.selling.price.2)

Residuals:
    Min      1Q  Median      3Q     Max 
-34.804  -9.496   0.917   7.931  73.338 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -47.992      8.209  -5.847 8.15e-08 ***
S             62.263      4.335  14.363  < 2e-16 ***
Ba            20.072      5.495   3.653 0.000438 ***
New           18.371      3.761   4.885 4.54e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 16.31 on 89 degrees of freedom
Multiple R-squared:  0.8681,    Adjusted R-squared:  0.8637 
F-statistic: 195.3 on 3 and 89 DF,  p-value: < 2.2e-16

Model1 <-broom::glance(HW4Q1.lm)
kable(Model1)

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual	nobs
0.8681361	0.8636912	16.31279	195.3127	0	3	-389.5683	789.1366	801.7996	23683.53	89	93

qpcR::PRESS(HW4Q1.lm)

.........10.........20.........30.........40.........50
.........60.........70.........80.........90...

$stat
[1] 27860.05

$residuals
 [1]   8.56463672  -0.03939913 -14.71598185 -27.21082231  84.40060532
 [6]  22.36861006 -35.08530974   6.82244427  15.21170995  -0.51102480
[11]   3.88826737  -9.89311754  28.17496046   6.77600635 -15.95800509
[16]  19.40028822  -4.05570104  -3.66605875 -17.19127450   0.53441460
[21]   1.08005016  22.59699698  -4.94082730   4.26577625  -7.19186400
[26] -16.66072432   5.03817053  -9.51913168  12.68588453 -32.01221384
[31] -17.80422930 -19.82438535   7.00571544   9.12956093   4.26928129
[36]   4.12150932 -22.85909891 -22.85909891 -22.85909891 -10.89668500
[41] -35.85354717  15.08107642  14.24947237  17.92355801  -8.99713028
[46]  -0.67074479   1.36360013   0.83787249  -4.83067593 -11.34598983
[51]  12.55261095  -7.37350222 -11.92917395   4.40174996 -10.00181947
[56]   3.51445921   3.27168316   5.68233212 -19.23631708   5.73458246
[61]  11.80266132  -1.67544951 -14.49966704   0.16479976 -30.20396045
[66]   8.40470609   8.94132534  -7.88286035  -2.08006411  25.41685658
[71] -14.00283697  -6.20668743   1.88916976  21.70704935  19.17475862
[76] -18.58720728   8.22508917  14.45313627   0.43946312  -2.76227404
[81]   1.18971200   4.43389546   2.90665652  -2.47643784   1.32976835
[86]  19.68889493   0.96971063   6.78392946 -38.94641062  12.05882289
[91]  23.58446546  -0.67223759  36.15367379

$P.square
[1] 0.8448823

HW4Q1a.lm <-lm(P ~ S +New, data = house.selling.price.2)
summary(HW4Q1a.lm)


Call:
lm(formula = P ~ S + New, data = house.selling.price.2)

Residuals:
    Min      1Q  Median      3Q     Max 
-47.207  -9.763  -0.091   9.984  76.405 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -26.089      5.977  -4.365 3.39e-05 ***
S             72.575      3.508  20.690  < 2e-16 ***
New           19.587      3.995   4.903 4.16e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.4 on 90 degrees of freedom
Multiple R-squared:  0.8484,    Adjusted R-squared:  0.845 
F-statistic: 251.8 on 2 and 90 DF,  p-value: < 2.2e-16

qpcR::PRESS(HW4Q1a.lm)

.........10.........20.........30.........40.........50
.........60.........70.........80.........90...

$stat
[1] 31066

$residuals
 [1]  -5.37843622   8.02122284 -11.32777134 -11.57833539  87.66277233
 [6]  15.70373259 -48.13451672  -3.26222816   4.46834650 -12.70206532
[11]  -7.14694966  -2.45551793  14.57698204  14.71553576 -11.36487654
[16]   3.52135636   1.11464899   1.39393760 -14.55151878   5.99060812
[21]   6.21290631   5.80504572  -0.62375565  10.21933262  -3.93350494
[26] -14.45062954  10.77312074  -5.61030428  19.07090388 -31.70842424
[31] -16.23266330 -18.67643485  11.57410928  13.58192784   8.12093808
[36]   9.54042514 -22.09514196 -22.09514196 -22.09514196 -10.27827037
[41] -39.23815733  20.21365336  18.67311241  23.23662761  -8.06355620
[46]   1.62301000   3.65438911   3.97178120  -3.31513157 -10.15177555
[51]  15.23927592  -8.22379429 -11.59722121   7.09431789 -11.58453893
[56]   5.99297057   4.24585847   8.04973684 -21.64221168   7.88708259
[61]  13.04306842  -2.41357878 -15.97777778  -0.68700490 -34.43423288
[66]   8.49802178   9.27265552  -9.47118722  -4.30217431  27.46946087
[71] -18.40320776  -9.37331536  -0.44477178  23.96818019  20.20515712
[76] -25.14986929   6.73333333  13.17359356  -2.01138295  -6.49134918
[81]  -3.09569066   0.67753723  -1.16963497  14.62269998  -3.28231853
[86]  15.10401849  -5.83653258  -0.09472978 -51.52790962  28.71524622
[91]  16.85539361  10.62683715  34.34057937

$P.square
[1] 0.8270324

Model2 <-broom::glance(HW4Q1a.lm)
kable(Model2)

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual	nobs
0.8483699	0.8450003	17.39529	251.7748	0	2	-396.0631	800.1262	810.2566	27233.66	90	93

HW4Q1b.lm <-lm(P ~ S +Be+Ba+New, data = house.selling.price.2)
summary(HW4Q1b.lm)


Call:
lm(formula = P ~ S + Be + Ba + New, data = house.selling.price.2)

Residuals:
    Min      1Q  Median      3Q     Max 
-36.212  -9.546   1.277   9.406  71.953 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -41.795     12.104  -3.453 0.000855 ***
S             64.761      5.630  11.504  < 2e-16 ***
Be            -2.766      3.960  -0.698 0.486763    
Ba            19.203      5.650   3.399 0.001019 ** 
New           18.984      3.873   4.902  4.3e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 16.36 on 88 degrees of freedom
Multiple R-squared:  0.8689,    Adjusted R-squared:  0.8629 
F-statistic: 145.8 on 4 and 88 DF,  p-value: < 2.2e-16

Model3 <-broom::glance(HW4Q1b.lm)
kable(Model3)

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual	nobs
0.868863	0.8629022	16.35994	145.7634	0	4	-389.3113	790.6225	805.8181	23552.98	88	93

qpcR::PRESS(HW4Q1b.lm)

.........10.........20.........30.........40.........50
.........60.........70.........80.........90...

$stat
[1] 28390.22

$residuals
 [1]   8.80642475   1.35756431 -14.50175027 -29.04111653  84.23071502
 [6]  20.67569000 -35.35051485   8.11027181  13.67887526   0.07581592
[11]   4.85328472 -11.83695825  26.87348374   8.26886073 -15.48538574
[16]  19.31532622  -3.40940204  -3.04383223 -17.15036945   1.27508747
[21]   1.74294861  22.35707134  -4.49743443   5.16137712  -6.99989072
[26] -14.75005106   5.88131054 -12.57508690  11.10604584 -32.59560238
[31] -18.02319107 -20.15370843   7.54904425   9.64952145   4.62088927
[36]   1.63097572 -23.70869816 -23.70869816 -23.70869816 -11.35464729
[41] -37.89758813  15.81945785  14.77909434  18.73454884  -9.37163361
[46]  -0.70587588   1.32848794   0.65652512  -5.41405416 -12.03102247
[51]  12.61959192  -8.21143765 -10.21858625   4.11858943  -8.37087703
[56]   3.17956757   2.58154628   5.32400952 -20.64063327   5.32602736
[61]  11.52097564   0.08288066 -13.16544457  -0.63681507 -29.48977860
[66]  11.06805978   8.13089578  -6.44748468  -3.24599794  25.33235011
[71] -16.03557507  -4.81181345   3.63613372  21.67212616  18.85145699
[76] -17.99852963   9.98343549  13.63493612  -1.10933099  -1.73965309
[81]   2.11051768   5.51373185   1.00558807  -3.14292693   2.17169862
[86]  21.07738062   1.61706399  11.40050558 -40.52079806  11.63736061
[91]  24.00171202  -0.18925139  34.97852503

$P.square
[1] 0.8419304

E. Explain which model you prefer and why.

Model_selection<- read_excel("HW4q1.xlsx")
kable(Model_selection)

MODEL	R2	adjR2	PRESS	AIC	BIC	AIC/BIC
Model1: Size+Bath+Bed+New	0.8689	0.8629	28390	790.6225	805.8181	0.9811426
Model2: Size+Bath+New	0.8681	0.8637	27860	789.1366	801.7996	0.9842068
Model3: Size+New	0.8484	0.8450	31066	800.1262	810.2566	0.9874973

Model 2 with predictors for Size, Bath and New would be the better model with lowest PRESS, AIC and BIC

Question 2

(Data file: trees from base R) From the documentation: “This data set provides measurements of the diameter, height and volume of timber in 31 felled black cherry trees. Note that the diameter (in inches) is erroneously labelled Girth in the data. It is measured at 4 ft 6 in above the ground.”

Tree volume estimation is a big deal, especially in the lumber industry. Use the trees data to build a basic model of tree volume prediction. In particular,

fit a multiple regression model with the Volume as the outcome and Girth and Height as the explanatory variables Run regression diagnostic plots on the model. Based on the plots, do you think any of the regression assumptions is violated?

data("trees")  
dim(trees)

[1] 31  3

colnames(trees)

[1] "Girth"  "Height" "Volume"

trees_lm <-lm(Volume ~ Girth+Height, data = trees)
summary(trees_lm)


Call:
lm(formula = Volume ~ Girth + Height, data = trees)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.4065 -2.6493 -0.2876  2.2003  8.4847 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -57.9877     8.6382  -6.713 2.75e-07 ***
Girth         4.7082     0.2643  17.816  < 2e-16 ***
Height        0.3393     0.1302   2.607   0.0145 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.882 on 28 degrees of freedom
Multiple R-squared:  0.948, Adjusted R-squared:  0.9442 
F-statistic:   255 on 2 and 28 DF,  p-value: < 2.2e-16

plot(trees[, 1:3], main = "Correlation plot")

par(mfrow = c(1, 1))
plot(trees_lm)

VIOLATIONS OBSERVED:

Residuals vs Fitted:

Assumption: Constant Variance. Violation: Funnel shape

Assumption: Normality. Violation: No Violation

Scale-Location

Violation: Heteroskedasticity

Residuals vs Leverage

Violation: Points outside red dashed lines

Question 3

(inspired by ALR 9.16)

(Data file: florida in alr R package)

In the 2000 election for U.S. president, the counting of votes in Florida was controversial. In Palm Beach County in south Florida, for example, voters used a so-called butterfly ballot. Some believe that the layout of the ballot caused some voters to cast votes for Buchanan when their intended choice was Gore.

The data has variables for the number of votes for each candidate—Gore, Bush, and Buchanan. Run a simple linear regression model where the Buchanan vote is the outcome and the Bush vote is the explanatory variable. Produce the regression diagnostic plots. Is Palm Beach County an outlier based on the diagnostic plots? Why or why not?

data("florida")  
dim(florida)

[1] 67  3

colnames(florida)

[1] "Gore"     "Bush"     "Buchanan"

florida_lm <-lm(Buchanan ~ Bush, data = florida)
summary(florida_lm)


Call:
lm(formula = Buchanan ~ Bush, data = florida)

Residuals:
    Min      1Q  Median      3Q     Max 
-907.50  -46.10  -29.19   12.26 2610.19 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.529e+01  5.448e+01   0.831    0.409    
Bush        4.917e-03  7.644e-04   6.432 1.73e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 353.9 on 65 degrees of freedom
Multiple R-squared:  0.3889,    Adjusted R-squared:  0.3795 
F-statistic: 41.37 on 1 and 65 DF,  p-value: 1.727e-08

par(mfrow = c(1, 1))
plot(florida_lm)

Based on the diagnostic plots, Palm Beach County is an outlier because it stands out and has a large residual from line that is predicted by regression model

PART 2 (Final Project)

Background: The increasing costs of higher education and competitiveness of admission to top universities have made it difficult for young adults to pursue and obtain college degrees. This study is still evolving and research goal is to prove or disprove how completion of college degrees can affect future employment and earnings. Also looking into comparing data from 2000 and 2019. Data is from all 50 US states. Data is sourced and cleaned in excel file.

Data sources:

https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/

https://www.ers.usda.gov/data-products/county-level-data-sets/

What is your research question for the final project?

Final question not formulated yet but general concept is:

Does education pay?

Considering the obstacles of obtaining higher education, is pursuing college degrees still significant?

What is your hypothesis (i.e. an answer to the research question) that you want to test?

Completing a higher degree education is significant to higher household income and employment status.

Present some exploratory analysis. In particular: Numerically summarize (e.g. with the summary() function) the variables of interest (the outcome, the explanatory variable, the control variables). Plot the relationships between key variables. You can do this any way you want, but one straightforward way of doing this would be with the pairs() function or other scatter plots / box plots. Interpret what you see.

Poster603 <- read_excel("Poster603.xlsx")
colnames(Poster603)

 [1] "FIPScode"           "state"              "areaname"          
 [4] "rt_ue_00"           "hs_00"              "assoc_col_00"      
 [7] "col_plus_00"        "pct_hs_00"          "pct_assoc_col_00"  
[10] "pct_col_plus_00"    "med_hh_inc_00"      "rt_ave_ue_1519"    
[13] "hs_1519"            "assoc_col_1519"     "col_plus_1519"     
[16] "pct_hs_1519"        "pct_assoc_col_1519" "pct_col_plus_1519" 
[19] "med_hh_inc_1519"

dim(Poster603)

[1] 50 19

kable(head(Poster603))

FIPScode	state	areaname	rt_ue_00	hs_00	assoc_col_00	col_plus_00	pct_hs_00	pct_assoc_col_00	pct_col_plus_00	med_hh_inc_00	rt_ave_ue_1519	hs_1519	assoc_col_1519	col_plus_1519	pct_hs_1519	pct_assoc_col_1519	pct_col_plus_1519	med_hh_inc_1519
01000	AL	Alabama	4.6	877216	746495	549608	30.4	25.9	19.0	44667.05	4.70	1022839	993344	845772	30.80027	29.91210	25.46833	51771
02000	AK	Alaska	6.3	105812	135655	93807	27.9	35.7	24.7	67482.77	6.14	134582	169609	142019	28.00373	35.29212	29.55121	77203
05000	AR	Arkansas	4.2	791904	1078521	766212	24.3	33.1	16.7	42111.47	3.98	1129129	1600240	1394526	23.85888	33.81361	29.46681	49020
04000	AZ	Arizona	4.0	590416	424907	288428	34.1	24.5	23.5	53071.81	5.24	684659	593576	463236	34.03489	29.50708	23.02779	62027
06000	CA	California	4.9	4288452	6397739	5669966	20.1	30.0	26.6	62146.54	5.02	5423462	7648680	8980726	20.48790	28.89397	33.92596	80423
08000	CO	Colorado	2.7	644360	861478	907755	23.2	31.0	32.7	61767.06	3.02	817452	1127242	1565134	21.36806	29.46592	40.91234	77104

VARIABLES OF INTEREST:

      "rt_ue_00"   -rate of unemployment 2000
      
      "hs_00"      -# of adults with highschool diploma 2000
      
      "assoc_col_00"    -# of adults with some college or associates degree 2000
      
      "col_plus_00"    -# of adults with bachelors degree or higher 2000
      
      "pct_hs_00"        -% of adults with highschool diploma 2000
      
      "pct_assoc_col_00"  -% of adults with associates or some college 2000
      
      "pct_col_plus_00"  -% of adults with bachelors degree or higher 2000
      
      "med_hh_inc_00"     -median household income 2000
      
      "rt_ave_ue_1519"    -rate of average unemployment 2015-2019
      
      "hs_1519"      -# of adults with highschool diploma 2015-2019
      
      "assoc_col_1519"  -# of adults with some college or associates degree 2015-2019
      
      "col_plus_1519"     # of adults with bachelors degree or higher 2015-2019
      
      "pct_hs_1519"        -% of adults with highschool diploma 2015-2019
      
      "pct_assoc_col_1519" -% of adults with associates or some college 2015-2019
      
      "pct_col_plus_1519" -% of adults with bachelors degree or higher 2015-2019
      
      "med_hh_inc_1519"   -median household income 2015-2019

Data from Year 2000

New_Poster00 <-Poster603 %>%
  dplyr::select("rt_ue_00","med_hh_inc_00","pct_hs_00","pct_assoc_col_00","pct_col_plus_00")
kable(head(New_Poster00))

rt_ue_00	med_hh_inc_00	pct_hs_00	pct_assoc_col_00	pct_col_plus_00
4.6	44667.05	30.4	25.9	19.0
6.3	67482.77	27.9	35.7	24.7
4.2	42111.47	24.3	33.1	16.7
4.0	53071.81	34.1	24.5	23.5
4.9	62146.54	20.1	30.0	26.6
2.7	61767.06	23.2	31.0	32.7

summary(New_Poster00)

    rt_ue_00     med_hh_inc_00     pct_hs_00     pct_assoc_col_00
 Min.   :2.100   Min.   :38858   Min.   :20.10   Min.   :21.00   
 1st Qu.:3.075   1st Qu.:48575   1st Qu.:27.80   1st Qu.:25.52   
 Median :3.800   Median :53310   Median :29.40   Median :27.60   
 Mean   :3.846   Mean   :54136   Mean   :29.95   Mean   :28.23   
 3rd Qu.:4.400   3rd Qu.:60699   3rd Qu.:32.20   3rd Qu.:30.85   
 Max.   :6.300   Max.   :72161   Max.   :39.40   Max.   :37.00   
 pct_col_plus_00
 Min.   :14.80  
 1st Qu.:21.27  
 Median :23.35  
 Mean   :23.78  
 3rd Qu.:26.18  
 Max.   :33.20

pairs(New_Poster00, main ="Figure 1. Education data 2000")

Data from Year 2015-2019

New_Poster1519 <-Poster603 %>%
  dplyr::select("rt_ave_ue_1519","med_hh_inc_1519","pct_hs_1519","pct_assoc_col_1519","pct_col_plus_1519")
head(New_Poster1519)

# A tibble: 6 x 5
  rt_ave_ue_1519 med_hh_inc_1519 pct_hs_1519 pct_assoc_col_1519
           <dbl>           <dbl>       <dbl>              <dbl>
1           4.7            51771        30.8               29.9
2           6.14           77203        28.0               35.3
3           3.98           49020        23.9               33.8
4           5.24           62027        34.0               29.5
5           5.02           80423        20.5               28.9
6           3.02           77104        21.4               29.5
# ... with 1 more variable: pct_col_plus_1519 <dbl>

summary(New_Poster1519)

 rt_ave_ue_1519  med_hh_inc_1519  pct_hs_1519    pct_assoc_col_1519
 Min.   :2.680   Min.   :45928   Min.   :20.49   Min.   :22.92     
 1st Qu.:3.550   1st Qu.:57445   1st Qu.:25.92   1st Qu.:28.05     
 Median :4.230   Median :63279   Median :28.04   Median :29.82     
 Mean   :4.198   Mean   :65121   Mean   :28.23   Mean   :30.04     
 3rd Qu.:4.730   3rd Qu.:73906   3rd Qu.:30.76   3rd Qu.:32.64     
 Max.   :6.140   Max.   :86644   Max.   :40.32   Max.   :36.73     
 pct_col_plus_1519
 Min.   :20.61    
 1st Qu.:27.71    
 Median :30.69    
 Mean   :31.24    
 3rd Qu.:34.15    
 Max.   :43.69

pairs(New_Poster1519,  main = "Figure 2. Education data 2015-2019")

Regression Analysis and Model Evaulations will be conducted for final paper. Based solely on Figure 1 and Figure 2, the initial observations are:

Initial_observations <- read_excel("Initial_obs.xlsx")
kable(head(Initial_observations))

Initial Cor observations:	2000	2015-19
Unemployment & HS	no cor	positive cor
Unemployment & some college/associates	no cor	no cor
Unemployment & college or higher	negative cor	negative cor
Median household income & HS	no cor	negative cor
Median household income & some college/assoc	no cor	no cor
Median household income & college or higher	positive cor	positive cor

Model Evaluation