library(dplyr)
Loading and cleaning the datasets
library(dplyr)
# Real-estate sales: expose the MUNIDESC column under the shared key name
# `neighborhood`, strip leading/trailing whitespace from that key, and parse
# SALEDATE into a Date.
housing_prices <- read.csv("/Users/ronan/Documents/INFSCI 0310/real-estate-sales.csv") %>%
  rename(neighborhood = MUNIDESC) %>%
  mutate(
    neighborhood = trimws(neighborhood),
    # NOTE(review): this format expects month-day-two-digit-year text; any
    # value that does not match is turned into NA -- confirm against the CSV.
    SALEDATE = as.Date(SALEDATE, format = "%m-%d-%y")
  )

# Vaccine data: rename the location column to the shared key `neighborhood`.
vaccines <- read.csv("/Users/ronan/Documents/INFSCI 0310/covid-vaccine-data.csv") %>%
  rename(neighborhood = neighborhood_municipality)

# Case data: same key rename so all three tables can be joined on
# `neighborhood`.
covid_cases <- read.csv("/Users/ronan/Documents/INFSCI 0310/covid-case-data.csv") %>%
  rename(neighborhood = neighborhood_municipality)
Merging datasets into one set
# Reduce each dataset to exactly one row per neighborhood, then inner-join
# the three summaries into `merged_data`.

# Drop sales with no recorded price so they cannot enter the averages.
housing_prices <- housing_prices %>%
  filter(!is.na(PRICE))

# Mean sale price per neighborhood. `na.rm = TRUE` has to be an argument of
# mean(); handed to summarize() it would instead create a constant column
# literally named `na.rm` that then leaks into the merged table.
housing_prices_summary <- housing_prices %>%
  group_by(neighborhood) %>%
  summarize(mean_housing_price = mean(PRICE, na.rm = TRUE))

# Total bivalent boosters per neighborhood. The column is aggregated with
# sum() so each neighborhood yields a single row; passing the bare column
# returns one row per input row and makes the merges below multiply rows.
# NA is deliberately not removed: a missing count stays NA instead of being
# silently treated as zero.
vaccines_summary <- vaccines %>%
  group_by(neighborhood) %>%
  summarize(total_vaccinations = sum(bivalent_booster))

# Total infections per neighborhood, aggregated the same way.
covid_cases_summary <- covid_cases %>%
  group_by(neighborhood) %>%
  summarize(total_covid_cases = sum(infections))

# merge() performs an inner join by default, so only neighborhoods present
# in all three summaries are kept.
merged_data <- merge(housing_prices_summary, covid_cases_summary, by = "neighborhood")
merged_data <- merge(merged_data, vaccines_summary, by = "neighborhood")
Splitting data into training and test sets
# Split `merged_data` 80/20 into training and test rows, then regress mean
# housing price on COVID case and vaccination totals using the training rows.

# Fixed seed so the random split, and therefore the fitted coefficients,
# can be reproduced from run to run.
set.seed(123)

# Split by row position rather than anti-joining on `neighborhood`: an
# anti-join removes every row that shares a key with a training row, which is
# only a true complement when the key is unique.
housing_n <- nrow(merged_data)
housing_train_rows <- sample.int(housing_n, size = round(0.8 * housing_n))
training_set <- merged_data[housing_train_rows, , drop = FALSE]
testing_set <- merged_data[
  setdiff(seq_len(housing_n), housing_train_rows), ,
  drop = FALSE
]

training_mlr <- lm(
  mean_housing_price ~ total_covid_cases + total_vaccinations,
  data = training_set
)
summary(training_mlr)
Call:
lm(formula = mean_housing_price ~ total_covid_cases + total_vaccinations,
data = training_set)
Residuals:
Min 1Q Median 3Q Max
-162051 -89564 -55667 25356 597788
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 171134.516 20381.731 8.396 7.71e-13 ***
total_covid_cases -7.584 4.719 -1.607 0.1116
total_vaccinations 55.536 32.852 1.690 0.0945 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 153400 on 87 degrees of freedom
Multiple R-squared: 0.03182, Adjusted R-squared: 0.009566
F-statistic: 1.43 on 2 and 87 DF, p-value: 0.2449
# Load the red-wine data, split it 80/20 into training and test rows, and fit
# a linear model of `quality` on the eleven measured properties.
wine_quality <- read.csv("/Users/ronan/Documents/INFSCI 0310/winequality-red.csv")

# Fixed seed so the random split, and therefore the fitted coefficients,
# can be reproduced from run to run.
set.seed(123)

# Split by row position. Anti-joining on `quality` (the response) would drop
# every row whose quality value also occurs in the training sample, not just
# the sampled rows, so the test set would not be the complement of the
# training set.
wine_n <- nrow(wine_quality)
wine_train_rows <- sample.int(wine_n, size = round(0.8 * wine_n))
wine_training_set <- wine_quality[wine_train_rows, , drop = FALSE]
wine_testing_set <- wine_quality[
  setdiff(seq_len(wine_n), wine_train_rows), ,
  drop = FALSE
]

wine_mlr <- lm(
  quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
    chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH +
    sulphates + alcohol,
  data = wine_training_set
)
summary(wine_mlr)
Call:
lm(formula = quality ~ fixed.acidity + volatile.acidity + citric.acid +
residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
density + pH + sulphates + alcohol, data = wine_training_set)
Residuals:
Min 1Q Median 3Q Max
-2.3118 -0.3647 -0.0428 0.4517 1.9373
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.050e+01 2.394e+01 0.857 0.391876
fixed.acidity 2.555e-02 2.995e-02 0.853 0.393654
volatile.acidity -1.041e+00 1.317e-01 -7.904 5.85e-15 ***
citric.acid -1.363e-01 1.621e-01 -0.841 0.400541
residual.sugar 1.007e-02 1.761e-02 0.572 0.567421
chlorides -2.172e+00 4.624e-01 -4.697 2.93e-06 ***
free.sulfur.dioxide 5.543e-03 2.414e-03 2.296 0.021810 *
total.sulfur.dioxide -2.955e-03 8.119e-04 -3.639 0.000284 ***
density -1.663e+01 2.447e+01 -0.680 0.496801
pH -4.099e-01 2.205e-01 -1.859 0.063266 .
sulphates 1.025e+00 1.331e-01 7.706 2.61e-14 ***
alcohol 2.876e-01 2.970e-02 9.684 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6437 on 1267 degrees of freedom
Multiple R-squared: 0.3754, Adjusted R-squared: 0.37
F-statistic: 69.23 on 11 and 1267 DF, p-value: < 2.2e-16