## Attach ggplot2 so the plotting functions used below are available.
library(ggplot2)

## Read the home price data from the CSV file into a data frame.
homeprice <- read.csv(file = "homeprice.csv")
## Scatterplots showing the relationship between sale price and each of the
## other variables. Each plot adds a least-squares line (method = "lm") without
## a confidence band (se = FALSE).
ggplot(homeprice, aes(x = sale, y = list)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. List Price")
## Observation: the points are grouped very closely around the line of best
## fit, indicating a strong relationship between sale price and list price.
`geom_smooth()` using formula = 'y ~ x'
## Sale price against the number of full bathrooms, with a least-squares line.
ggplot(data = homeprice, mapping = aes(x = sale, y = full)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. Number of Full bathrooms")
## Observation: full bathrooms only takes 3 distinct values, so the points
## form horizontal bands rather than clustering around the line of best fit.
## There is still a steady increasing trend (more full bathrooms go with a
## higher sale price), showing a decent relationship between the variables.
`geom_smooth()` using formula = 'y ~ x'
## Sale price against the number of half bathrooms, with a least-squares line.
ggplot(data = homeprice, mapping = aes(x = sale, y = half)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. Number of Half bathrooms")
## Observation: as with full bathrooms, there are only 3 distinct values for
## half bathrooms, so the points are not grouped around the line of best fit.
## The line rises more slowly, indicating a relationship between sale price
## and the number of half bathrooms, but not a very strong one.
`geom_smooth()` using formula = 'y ~ x'
## Sale price against the number of bedrooms, with a least-squares line.
ggplot(data = homeprice, mapping = aes(x = sale, y = bedrooms)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. Number of Bedrooms")
## Observation: the points are spread far from the line of best fit, and
## outliers such as the houses with one or five bedrooms may be influencing
## it. Overall, more bedrooms go with a higher sale price, but the
## relationship is not very strong.
`geom_smooth()` using formula = 'y ~ x'
## Sale price against the number of non-bedroom rooms, with a least-squares
## line.
ggplot(data = homeprice, mapping = aes(x = sale, y = rooms)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. Number of non-bedrooms")
## Observation: the points are more tightly grouped around the line of best
## fit, although a few houses with unusually many or few rooms may be
## influencing it. Apart from those, the plot shows a decently strong
## relationship between more non-bedroom rooms and a higher sale price.
`geom_smooth()` using formula = 'y ~ x'
## Sale price against neighborhood rank, with a least-squares line.
ggplot(data = homeprice, mapping = aes(x = sale, y = neighborhood)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Sale Price vs. Neighborhood rank")
## Observation: very strong relationship between neighborhood rank and sale
## price. The points are heavily centered around the line of best fit, with
## few outliers, showing that a higher neighborhood rank goes with a higher
## sale price.
`geom_smooth()` using formula = 'y ~ x'
## The two variables most strongly related to sale price are list price and
## neighborhood rank, which are also the only two variables not based on
## types of rooms. The relationship is strongest in these two plots because
## the data are most closely grouped around the line of best fit. It also
## makes intuitive sense: a higher list price should go with a higher sale
## price, and the higher a neighborhood ranks, the more its houses should
## sell for.
## Multiple linear regression model explaining sale price from list price and
## the house characteristics.
sale_price <- lm(
  sale ~ list + full + half + bedrooms + rooms + neighborhood,
  data = homeprice
)

## Print coefficient estimates, t tests, and overall fit statistics.
summary(sale_price)
## list has by far the largest t value and is the only statistically
## significant predictor of sale price. The residuals are not large, which
## suggests the model fits the data without much bias.
Call:
lm(formula = sale ~ list + full + half + bedrooms + rooms + neighborhood,
data = homeprice)
Residuals:
Min 1Q Median 3Q Max
-28.807 -6.626 -0.270 5.580 32.933
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.13359 17.15496 0.299 0.768
list 0.97131 0.07616 12.754 1.22e-11 ***
full -4.97759 5.48033 -0.908 0.374
half -1.00644 5.70418 -0.176 0.862
bedrooms 2.49224 6.43616 0.387 0.702
rooms -0.43411 3.70424 -0.117 0.908
neighborhood 2.03434 6.88609 0.295 0.770
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.87 on 22 degrees of freedom
Multiple R-squared: 0.989, Adjusted R-squared: 0.986
F-statistic: 330.5 on 6 and 22 DF, p-value: < 2.2e-16
## Analysis of variance table for the sale price model. For a single lm fit,
## anova() reports sequential tests, so each term is assessed after the terms
## listed before it in the formula.
## list is the only significant term (it has the smallest p-value), meaning it
## has the greatest effect on sale price.
anova(object = sale_price)
Analysis of Variance Table
Response: sale
Df Sum Sq Mean Sq F value Pr(>F)
list 1 381050 381050 1981.6252 <2e-16 ***
full 1 156 156 0.8116 0.3774
half 1 21 21 0.1092 0.7441
bedrooms 1 25 25 0.1314 0.7204
rooms 1 3 3 0.0141 0.9065
neighborhood 1 17 17 0.0873 0.7704
Residuals 22 4230 192
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Extract the residuals of the sale price model and plot their distribution
## to check the normality assumption.
## The variable keeps the name `residuals` used throughout this script; R still
## resolves residuals(...) calls to the function, because function lookup skips
## non-function objects.
residuals <- residuals(sale_price)
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")
## Observation: the residuals look right skewed rather than perfectly normal,
## so the normality assumption holds only approximately.
## Multiple linear regression model explaining list price from sale price and
## the house characteristics.
list_price <- lm(
  list ~ sale + full + half + bedrooms + rooms + neighborhood,
  data = homeprice
)

## Print coefficient estimates, t tests, and overall fit statistics.
summary(list_price)
## sale has the smallest p-value, making it the only variable with a
## statistically significant relationship with list price.
Call:
lm(formula = list ~ sale + full + half + bedrooms + rooms + neighborhood,
data = homeprice)
Residuals:
Min 1Q Median 3Q Max
-27.8544 -6.7013 -0.7265 6.7894 31.3427
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -21.8752 15.9419 -1.372 0.184
sale 0.9069 0.0711 12.754 1.22e-11 ***
full 8.3411 5.0923 1.638 0.116
half 6.3398 5.3475 1.186 0.248
bedrooms -0.0627 6.2402 -0.010 0.992
rooms 1.2426 3.5706 0.348 0.731
neighborhood 7.3793 6.4787 1.139 0.267
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.4 on 22 degrees of freedom
Multiple R-squared: 0.9903, Adjusted R-squared: 0.9876
F-statistic: 373.3 on 6 and 22 DF, p-value: < 2.2e-16
## Analysis of variance table for the list price model. For a single lm fit,
## anova() reports sequential tests, so each term is assessed after the terms
## listed before it in the formula.
## sale is the only term with a significant p-value (also the smallest one),
## making it the only variable with a significant relationship with list
## price.
anova(object = list_price)
Analysis of Variance Table
Response: list
Df Sum Sq Mean Sq F value Pr(>F)
sale 1 401374 401374 2235.5702 <2e-16 ***
full 1 346 346 1.9259 0.1791
half 1 134 134 0.7440 0.3977
bedrooms 1 4 4 0.0209 0.8864
rooms 1 24 24 0.1326 0.7192
neighborhood 1 233 233 1.2973 0.2670
Residuals 22 3950 180
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Extract the residuals of the list price model and plot their distribution
## to check the normality assumption.
## The variable keeps the name `residuals` used throughout this script; R still
## resolves residuals(...) calls to the function, because function lookup skips
## non-function objects.
residuals <- residuals(list_price)
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals")
## Observation: these residuals look more normally distributed than those of
## the sale price model, so the normality assumption is better satisfied here.
## The conclusions do not differ from the sale price model: list price and
## sale price are still the only variables with a large effect on each other.
## Because none of the other variables has a significant relationship with
## list price, the model does not, strictly speaking, say which
## characteristics real estate agents should base a house's list price on.
## Difference between sale price and list price for each house.
homeprice$diff <- homeprice$sale - homeprice$list

## Simple linear regression of that difference on neighborhood rank, to see
## whether the neighborhood explains how far the sale price departs from the
## list price.
neighborhood_diff <- lm(diff ~ neighborhood, data = homeprice)

## Print coefficient estimates, t tests, and overall fit statistics.
summary(neighborhood_diff)
## The p-value for neighborhood is not statistically significant, meaning
## there is no evidence of a relationship between neighborhood ranking and the
## difference between sale price and list price.
Call:
lm(formula = diff ~ neighborhood, data = homeprice)
Residuals:
Min 1Q Median 3Q Max
-30.05 -7.50 -0.85 5.80 33.05
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.800 7.435 1.049 0.303
neighborhood -3.150 2.428 -1.298 0.205
Residual standard error: 13 on 27 degrees of freedom
Multiple R-squared: 0.0587, Adjusted R-squared: 0.02383
F-statistic: 1.684 on 1 and 27 DF, p-value: 0.2054
## Analysis of variance table for the sale-minus-list difference model.
## The p-value is again not statistically significant, so there is no
## statistical relationship between neighborhood ranking and the difference
## between sale price and list price.
anova(object = neighborhood_diff)
Analysis of Variance Table
Response: diff
Df Sum Sq Mean Sq F value Pr(>F)
neighborhood 1 284.7 284.67 1.6837 0.2054
Residuals 27 4565.1 169.08
## Because there is no statistically significant relationship between the
## variables, even though one might be expected, the data suggest that houses
## in richer neighborhoods do not generally sell above their listing price.
## This was rather surprising, as I assumed richer neighborhoods would be
## correlated with inflated prices.