# load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Report the current working directory (for reference only; the paths below
# are built from data_dir, not from the working directory)
getwd()
## [1] "/cloud/project/housing"
# Directory holding the dataset, defined once so every path derives from it
data_dir <- "/cloud/project/housing"
# List the files available in the data directory
list.files(data_dir)
## [1] "analysis"
## [2] "Analysis Report of Housing Dataset.Rmd"
## [3] "Analysis-Report-of-Housing-Dataset.pdf"
## [4] "Analysis-Report-of-Housing-Dataset.Rmd"
## [5] "Housing_Dataset.csv"
# Load the dataset and preview its first rows
housing <- read.csv(file.path(data_dir, "Housing_Dataset.csv"))
head(housing)
## HouseID Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 1 1 535243 2105 3 2 1 2018 C
## 2 2 258369 3621 2 1 0 2014 C
## 3 3 213112 3818 2 1 1 1967 C
## 4 4 588786 3281 5 2 0 1989 A
## 5 5 344194 1899 2 1 0 1968 B
## 6 6 328393 3804 3 1 1 2011 A
## Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 1 Excellent 7.2 6 0 110 Available
## 2 Excellent 3.6 5 1 79 Sold
## 3 Fair 1.7 6 0 56 Available
## 4 Poor 14.2 2 0 77 Available
## 5 Fair 7.7 7 1 74 Sold
## 6 Excellent 12.5 9 0 44 Available
# Mean listing price per neighborhood (missing prices are ignored)
Average_cost <- summarise(
  group_by(housing, Neighborhood),
  avg_price = mean(Price, na.rm = TRUE),
  .groups = "drop"
)
print(Average_cost)
## # A tibble: 3 × 2
## Neighborhood avg_price
## <chr> <dbl>
## 1 A 404160.
## 2 B 414912.
## 3 C 364091.
# Bar chart: one orange bar per neighborhood, bar height = mean price
library(ggplot2)
ggplot(Average_cost, aes(Neighborhood, avg_price)) +
  geom_col(fill = "orange") +
  labs(title = "Average Prices for Neighborhoods")

# What is the average, median, and mode price of houses across all locations?
# summary() reports the mean and median but never the mode, so the three
# statistics are computed for Price explicitly.
price_counts <- table(housing$Price) # table() ignores missing prices
price_mode <- if (length(price_counts) > 0 && max(price_counts) > 1) {
  # every price tied for the highest frequency is reported
  as.numeric(names(price_counts)[price_counts == max(price_counts)])
} else {
  NA_real_ # no price occurs more than once, so there is no meaningful mode
}
print(c(
  mean = mean(housing$Price, na.rm = TRUE),
  median = median(housing$Price, na.rm = TRUE)
))
print(price_mode)
# Per-column overview of the whole dataset
summary(housing)
## HouseID Price Size Bedrooms Bathrooms
## Min. : 1.00 Min. :203407 Min. :1002 Min. :2.00 Min. :1.00
## 1st Qu.: 25.75 1st Qu.:288711 1st Qu.:1793 1st Qu.:2.00 1st Qu.:1.00
## Median : 50.50 Median :387236 Median :2508 Median :3.00 Median :2.00
## Mean : 50.50 Mean :394779 Mean :2516 Mean :3.42 Mean :2.27
## 3rd Qu.: 75.25 3rd Qu.:511099 3rd Qu.:3270 3rd Qu.:5.00 3rd Qu.:3.00
## Max. :100.00 Max. :597833 Max. :3957 Max. :5.00 Max. :4.00
## Garage YearBuilt Neighborhood Condition
## Min. :0.00 Min. :1960 Length:100 Length:100
## 1st Qu.:0.00 1st Qu.:1980 Class :character Class :character
## Median :1.00 Median :1992 Mode :character Mode :character
## Mean :0.54 Mean :1993
## 3rd Qu.:1.00 3rd Qu.:2008
## Max. :1.00 Max. :2020
## DistanceToCity SchoolRating HasPool DaysOnMarket
## Min. : 1.000 Min. : 1.00 Min. :0.00 Min. : 11.00
## 1st Qu.: 4.950 1st Qu.: 4.00 1st Qu.:0.00 1st Qu.: 42.75
## Median : 8.450 Median : 6.00 Median :1.00 Median : 69.00
## Mean : 8.165 Mean : 5.95 Mean :0.53 Mean : 67.78
## 3rd Qu.:11.025 3rd Qu.: 8.00 3rd Qu.:1.00 3rd Qu.: 96.00
## Max. :15.000 Max. :10.00 Max. :1.00 Max. :120.00
## SellingStatus
## Length:100
## Class :character
## Mode :character
##
##
##
# Distribution of house prices: histogram with ggplot2's default binning
ggplot(housing, aes(Price)) +
  geom_histogram(fill = "lightblue", colour = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sum of all listing prices within each neighborhood (missing prices ignored)
location_price <- summarise(
  group_by(housing, Neighborhood),
  Total_price = sum(Price, na.rm = TRUE),
  .groups = "drop"
)
print(location_price)
## # A tibble: 3 × 2
## Neighborhood Total_price
## <chr> <int>
## 1 A 14549772
## 2 B 13277178
## 3 C 11650926
# Bar chart of the neighborhood totals, each bar filled by its neighborhood
ggplot(
  location_price,
  aes(Neighborhood, Total_price, fill = Neighborhood)
) +
  geom_col() +
  labs(title = "Total price by Location")

# Distribution of property sizes: histogram of the Size column with
# ggplot2's default binning
ggplot(housing, aes(Size)) +
  geom_histogram(fill = "lightblue", colour = "black") +
  labs(title = "Distribution of Property Sizes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Price against construction year: semi-transparent points plus the default
# smoother, to reveal any trend over time
ggplot(housing, aes(YearBuilt, Price)) +
  geom_point(colour = "blue", alpha = 0.6) +
  geom_smooth() +
  labs(title = "Trends in construction year of properties")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Compare mean Price between the two HasPool groups (0 and 1) with a
# two-sample t-test, then show the full test report
sig_diff <- t.test(
  Price ~ HasPool,
  data = housing
)
print(sig_diff)
##
## Welch Two Sample t-test
##
## data: Price by HasPool
## t = -0.44645, df = 96.781, p-value = 0.6563
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -58779.90 37192.36
## sample estimates:
## mean in group 0 mean in group 1
## 389058.1 399851.8
# Side-by-side boxplots of Price for each HasPool value; HasPool is turned
# into a factor so it is drawn as discrete groups
ggplot(housing, aes(factor(HasPool), Price)) +
  geom_boxplot(fill = "blue", alpha = 0.6) +
  labs(title = "Price difference between houses")

# Do houses in urban areas tend to cost more than those in rural areas?
# NOTE(review): Neighborhood is the location grouping used here; whether its
# levels correspond to urban vs rural areas is not shown in this script - confirm.
anova_result <- aov(Price ~ Neighborhood, data = housing)
# summary() adds the F statistic and p-value, which are needed to judge whether
# mean prices differ between neighborhoods; printing the fit alone only shows
# sums of squares and degrees of freedom.
print(summary(anova_result))
print(anova_result)
## Call:
## aov(formula = Price ~ Neighborhood, data = housing)
##
## Terms:
## Neighborhood Residuals
## Sum of Squares 4.627415e+10 1.385168e+12
## Deg. of Freedom 2 97
##
## Residual standard error: 119499.3
## Estimated effects may be unbalanced
# Boxplots of Price for each neighborhood
ggplot(housing, aes(factor(Neighborhood), Price)) +
  geom_boxplot(fill = "lightblue", alpha = 0.6) +
  labs(title = "Association between Neighborhood and Cost")

# How much do square footage, the number of bedrooms, and the presence of a
# swimming pool contribute to predicting house prices?
model1 <- lm(Price ~ Size + Bedrooms + HasPool, data = housing)
# summary() reports standard errors, p-values and R-squared, which are needed
# to assess each predictor's contribution; the plain print shows coefficients only.
print(summary(model1))
print(model1)
##
## Call:
## lm(formula = Price ~ Size + Bedrooms + HasPool, data = housing)
##
## Coefficients:
## (Intercept) Size Bedrooms HasPool
## 384400.730 -4.023 4405.079 10251.589
# visualize output for Price and Size
# Price is the response in model1, so it belongs on the y axis with the
# predictor (Size) on the x axis.
ggplot(data = housing, mapping = aes(x = Size, y = Price)) +
  geom_point() +
  geom_smooth() +
  labs(title = 'How variables influence price')
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Does the number of years since construction influence house prices
# The model regresses Price on YearBuilt directly, so a positive slope means
# newer houses are priced higher.
model2 <- lm(Price ~ YearBuilt, data = housing)
# summary() adds the slope's standard error, p-value and R-squared, which are
# needed to decide whether YearBuilt has any detectable effect.
print(summary(model2))
print(model2)
##
## Call:
## lm(formula = Price ~ YearBuilt, data = housing)
##
## Coefficients:
## (Intercept) YearBuilt
## -1199417.9 799.8
# visualize output
# Price is the response in model2, so it goes on the y axis and YearBuilt on
# the x axis.
ggplot(data = housing, mapping = aes(x = YearBuilt, y = Price)) +
  geom_point(color = 'blue', alpha = 0.6) +
  geom_smooth() +
  labs(title = 'How Years since construction affect house prices')
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Which properties fall in the top 5% and bottom 5% of prices?
# step1: establish 5th and 95th percentiles (missing prices are ignored)
bottom_5 <- quantile(housing$Price, 0.05, na.rm = TRUE)
top_95 <- quantile(housing$Price, 0.95, na.rm = TRUE)
# step2: establish properties in the bottom and top 5%
# which() drops comparisons that evaluate to NA, so a missing Price cannot add
# all-NA rows to the result (plain logical indexing would).
bottom_properties <- housing[which(housing$Price <= bottom_5), ]
top_properties <- housing[which(housing$Price >= top_95), ]
# step3: print results
print(bottom_properties)
## HouseID Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 3 3 213112 3818 2 1 1 1967 C
## 17 17 216663 3940 5 3 1 1999 A
## 18 18 215622 2288 5 2 1 2012 B
## 24 24 213912 2879 2 3 0 2018 C
## 36 36 203407 2482 4 3 0 2013 B
## Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 3 Fair 1.7 6 0 56 Available
## 17 Good 9.5 8 0 24 Sold
## 18 Fair 12.1 1 0 69 Sold
## 24 Good 14.9 8 0 96 Sold
## 36 Poor 1.0 2 1 43 Sold
# Show the properties priced at or above the 95th percentile
print(x = top_properties)
## HouseID Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 4 4 588786 3281 5 2 0 1989 A
## 9 9 586123 1938 3 1 0 1993 B
## 12 12 588323 2643 3 1 0 1980 B
## 37 37 597833 1898 5 3 1 2000 B
## 55 55 582589 3167 5 4 1 1994 A
## Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 4 Poor 14.2 2 0 77 Available
## 9 Poor 9.8 8 1 42 Available
## 12 Poor 8.9 4 0 113 Available
## 37 Good 7.3 4 0 78 Sold
## 55 Fair 2.0 10 1 89 Available
# Price histogram in 5000-wide bins, with dashed markers at the 5th (blue)
# and 95th (green) percentiles computed earlier in the script
ggplot(housing, aes(Price)) +
  geom_histogram(binwidth = 5000, fill = "lightblue", colour = "black") +
  geom_vline(xintercept = bottom_5, linetype = "dashed", colour = "blue") +
  geom_vline(xintercept = top_95, linetype = "dashed", colour = "green") +
  labs(title = "Outlier Properties")

# Mean price for each Condition value (missing prices ignored)
condition_price <- summarise(
  group_by(housing, Condition),
  Avg_price = mean(Price, na.rm = TRUE),
  .groups = "drop"
)
print(condition_price)
## # A tibble: 4 × 2
## Condition Avg_price
## <chr> <dbl>
## 1 Excellent 389179.
## 2 Fair 397169.
## 3 Good 358899
## 4 Poor 423705.
# Bar chart of mean price per condition; bars are filled by condition and
# outlined in black
ggplot(condition_price, aes(Condition, Avg_price, fill = Condition)) +
  geom_col(colour = "black") +
  labs(title = "How property price varies by Condition")

# what is the price range by number of bedrooms
# A range needs the lowest and highest price per bedroom count; the mean alone
# does not answer the question.
bedroom_range <- housing %>%
  group_by(Bedrooms) %>%
  summarise(
    Min_price = min(Price, na.rm = TRUE),
    Max_price = max(Price, na.rm = TRUE),
    Price_range = Max_price - Min_price,
    .groups = 'drop'
  )
print(bedroom_range)
# Average price per bedroom count
bedroom_price <- housing %>%
  group_by(Bedrooms) %>%
  summarise(Avg_price = mean(Price, na.rm = TRUE), .groups = 'drop')
print(bedroom_price)
## # A tibble: 4 × 2
## Bedrooms Avg_price
## <int> <dbl>
## 1 2 394494.
## 2 3 381591.
## 3 4 406563.
## 4 5 402968.
# visualize output
# The bars show the average price per bedroom count, so the title says
# "Average price" rather than "Price range".
ggplot(data = bedroom_price, mapping = aes(x = Bedrooms, y = Avg_price)) +
  geom_col(fill = 'lightblue', color = 'black') +
  labs(title = 'Average price by no of bedrooms')
