# load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Show the current working directory
getwd()
## [1] "/cloud/project/housing"
# List the files available in the housing project folder
list.files(path = "/cloud/project/housing")
## [1] "analysis"                              
## [2] "Analysis Report of Housing Dataset.Rmd"
## [3] "Analysis-Report-of-Housing-Dataset.pdf"
## [4] "Analysis-Report-of-Housing-Dataset.Rmd"
## [5] "Housing_Dataset.csv"
# Read the housing CSV into a data frame and preview its first rows
housing <- read.csv(file = "/cloud/project/housing/Housing_Dataset.csv")
head(housing)
##   HouseID  Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 1       1 535243 2105        3         2      1      2018            C
## 2       2 258369 3621        2         1      0      2014            C
## 3       3 213112 3818        2         1      1      1967            C
## 4       4 588786 3281        5         2      0      1989            A
## 5       5 344194 1899        2         1      0      1968            B
## 6       6 328393 3804        3         1      1      2011            A
##   Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 1 Excellent            7.2            6       0          110     Available
## 2 Excellent            3.6            5       1           79          Sold
## 3      Fair            1.7            6       0           56     Available
## 4      Poor           14.2            2       0           77     Available
## 5      Fair            7.7            7       1           74          Sold
## 6 Excellent           12.5            9       0           44     Available
# What are the average prices for the different neighborhoods?

# Mean price per neighborhood; missing prices are ignored.
Average_cost <- housing %>%
  group_by(Neighborhood) %>%
  summarise(
    avg_price = mean(Price, na.rm = TRUE),
    .groups = "drop"
  )
print(Average_cost)
## # A tibble: 3 × 2
##   Neighborhood avg_price
##   <chr>            <dbl>
## 1 A              404160.
## 2 B              414912.
## 3 C              364091.
# Bar chart of the average price in each neighborhood
library(ggplot2)

ggplot(Average_cost, aes(x = Neighborhood, y = avg_price)) +
  geom_col(fill = "orange") +
  labs(title = "Average Prices for Neighborhoods")

# What is the average, median, and mode price of houses across all locations?

# summary() reports the mean and median of Price but no statistical mode (the
# "Mode" row it prints for character columns is the storage mode), so the
# three price statistics are computed explicitly here.
price_frequency <- table(housing$Price)
if (length(price_frequency) > 0 && max(price_frequency) > 1) {
  # At least one price repeats: keep every value tied for most frequent.
  price_mode <- as.numeric(
    names(price_frequency)[price_frequency == max(price_frequency)]
  )
} else {
  # Every price occurs once (or none are present), so no mode exists.
  price_mode <- NA_real_
}
price_centre <- list(
  mean = mean(housing$Price, na.rm = TRUE),
  median = median(housing$Price, na.rm = TRUE),
  mode = price_mode
)
print(price_centre)

# Column-by-column overview of the whole dataset.
summary(housing)
##     HouseID           Price             Size         Bedrooms      Bathrooms   
##  Min.   :  1.00   Min.   :203407   Min.   :1002   Min.   :2.00   Min.   :1.00  
##  1st Qu.: 25.75   1st Qu.:288711   1st Qu.:1793   1st Qu.:2.00   1st Qu.:1.00  
##  Median : 50.50   Median :387236   Median :2508   Median :3.00   Median :2.00  
##  Mean   : 50.50   Mean   :394779   Mean   :2516   Mean   :3.42   Mean   :2.27  
##  3rd Qu.: 75.25   3rd Qu.:511099   3rd Qu.:3270   3rd Qu.:5.00   3rd Qu.:3.00  
##  Max.   :100.00   Max.   :597833   Max.   :3957   Max.   :5.00   Max.   :4.00  
##      Garage       YearBuilt    Neighborhood        Condition        
##  Min.   :0.00   Min.   :1960   Length:100         Length:100        
##  1st Qu.:0.00   1st Qu.:1980   Class :character   Class :character  
##  Median :1.00   Median :1992   Mode  :character   Mode  :character  
##  Mean   :0.54   Mean   :1993                                        
##  3rd Qu.:1.00   3rd Qu.:2008                                        
##  Max.   :1.00   Max.   :2020                                        
##  DistanceToCity    SchoolRating      HasPool      DaysOnMarket   
##  Min.   : 1.000   Min.   : 1.00   Min.   :0.00   Min.   : 11.00  
##  1st Qu.: 4.950   1st Qu.: 4.00   1st Qu.:0.00   1st Qu.: 42.75  
##  Median : 8.450   Median : 6.00   Median :1.00   Median : 69.00  
##  Mean   : 8.165   Mean   : 5.95   Mean   :0.53   Mean   : 67.78  
##  3rd Qu.:11.025   3rd Qu.: 8.00   3rd Qu.:1.00   3rd Qu.: 96.00  
##  Max.   :15.000   Max.   :10.00   Max.   :1.00   Max.   :120.00  
##  SellingStatus     
##  Length:100        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# What is the distribution of house prices?

# Histogram of Price using ggplot2's default binning.
ggplot(housing, aes(x = Price)) +
  geom_histogram(fill = "lightblue", color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# How does the total price vary by location?

# Sum of Price within each neighborhood; missing prices are skipped.
location_price <- housing %>%
  group_by(Neighborhood) %>%
  summarise(
    Total_price = sum(Price, na.rm = TRUE),
    .groups = "drop"
  )
print(location_price)
## # A tibble: 3 × 2
##   Neighborhood Total_price
##   <chr>              <int>
## 1 A               14549772
## 2 B               13277178
## 3 C               11650926
# Bar chart of total price, with one fill colour per neighborhood
ggplot(
  location_price,
  aes(x = Neighborhood, y = Total_price, fill = Neighborhood)
) +
  geom_col() +
  labs(title = "Total price by Location")

# What is the distribution of property sizes (in square footage)?

# Histogram of Size using ggplot2's default binning.
ggplot(housing, aes(x = Size)) +
  geom_histogram(fill = "lightblue", color = "black") +
  labs(title = "Distribution of Property Sizes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Are there any noticeable trends in the construction year of properties?

# Scatter of Price against YearBuilt with a smoothed trend line on top.
ggplot(housing, aes(x = YearBuilt, y = Price)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth() +
  labs(title = "Trends in construction year of properties")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Is there a significant difference in prices between houses with and without
# swimming pools?

# Two-sample t-test of Price, grouped by the HasPool indicator.
sig_diff <- t.test(Price ~ HasPool, data = housing)
print(sig_diff)
## 
##  Welch Two Sample t-test
## 
## data:  Price by HasPool
## t = -0.44645, df = 96.781, p-value = 0.6563
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -58779.90  37192.36
## sample estimates:
## mean in group 0 mean in group 1 
##        389058.1        399851.8
# Box plot of Price for each HasPool group
ggplot(housing, aes(x = factor(HasPool), y = Price)) +
  geom_boxplot(fill = "blue", alpha = 0.6) +
  labs(title = "Price difference between houses")

# Do houses in urban areas tend to cost more than those in rural areas?
# NOTE(review): the data shown has no urban/rural column; Neighborhood is the
# grouping actually tested here -- confirm it is the intended proxy.
anova_result <- aov(Price ~ Neighborhood, data = housing)
# Printing the aov object alone shows only sums of squares and degrees of
# freedom; summary() adds the F statistic and p-value needed to judge whether
# the neighborhood means differ.
print(summary(anova_result))
print(anova_result)
## Call:
##    aov(formula = Price ~ Neighborhood, data = housing)
## 
## Terms:
##                 Neighborhood    Residuals
## Sum of Squares  4.627415e+10 1.385168e+12
## Deg. of Freedom            2           97
## 
## Residual standard error: 119499.3
## Estimated effects may be unbalanced
# Box plot of Price for each neighborhood
ggplot(housing, aes(x = factor(Neighborhood), y = Price)) +
  geom_boxplot(fill = "lightblue", alpha = 0.6) +
  labs(title = "Association between Neighborhood and Cost")

# How much do square footage, the number of bedrooms, and the presence of a
# swimming pool contribute to predicting house prices?

# Linear regression of Price on Size, Bedrooms and HasPool.
model1 <- lm(Price ~ Size + Bedrooms + HasPool, data = housing)
print(model1)
## 
## Call:
## lm(formula = Price ~ Size + Bedrooms + HasPool, data = housing)
## 
## Coefficients:
## (Intercept)         Size     Bedrooms      HasPool  
##  384400.730       -4.023     4405.079    10251.589
# Visualize output for Price and Size. Price is the quantity being predicted,
# so it belongs on the y-axis with the predictor Size on the x-axis; the
# smoother then describes Price as a function of Size.
ggplot(housing, aes(x = Size, y = Price)) +
  geom_point() +
  geom_smooth() +
  labs(title = "How variables influence price")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Does the number of years since construction influence house prices?

# Linear regression of Price on YearBuilt.
# NOTE(review): the predictor is the construction year itself, not an age in
# years -- confirm that is what the question intends.
model2 <- lm(Price ~ YearBuilt, data = housing)
print(model2)
## 
## Call:
## lm(formula = Price ~ YearBuilt, data = housing)
## 
## Coefficients:
## (Intercept)    YearBuilt  
##  -1199417.9        799.8
# Visualize output. Price is the response being explained, so it is plotted on
# the y-axis against the predictor YearBuilt on the x-axis; the smoother then
# describes Price as a function of YearBuilt.
ggplot(housing, aes(x = YearBuilt, y = Price)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth() +
  labs(title = "How Years since construction affect house prices")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Which properties fall in the top 5% and bottom 5% of prices?

# Step 1: price cut-offs at the 5th and 95th percentiles
bottom_5 <- quantile(housing$Price, probs = 0.05, na.rm = TRUE)
top_95 <- quantile(housing$Price, probs = 0.95, na.rm = TRUE)

# Step 2: keep the rows priced at or beyond each cut-off
bottom_properties <- housing[housing$Price <= bottom_5, ]
top_properties <- housing[housing$Price >= top_95, ]

# Step 3: print the lowest-priced group
print(bottom_properties)
##    HouseID  Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 3        3 213112 3818        2         1      1      1967            C
## 17      17 216663 3940        5         3      1      1999            A
## 18      18 215622 2288        5         2      1      2012            B
## 24      24 213912 2879        2         3      0      2018            C
## 36      36 203407 2482        4         3      0      2013            B
##    Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 3       Fair            1.7            6       0           56     Available
## 17      Good            9.5            8       0           24          Sold
## 18      Fair           12.1            1       0           69          Sold
## 24      Good           14.9            8       0           96          Sold
## 36      Poor            1.0            2       1           43          Sold
# Print the rows priced at or above the 95th-percentile cut-off.
print(top_properties)
##    HouseID  Price Size Bedrooms Bathrooms Garage YearBuilt Neighborhood
## 4        4 588786 3281        5         2      0      1989            A
## 9        9 586123 1938        3         1      0      1993            B
## 12      12 588323 2643        3         1      0      1980            B
## 37      37 597833 1898        5         3      1      2000            B
## 55      55 582589 3167        5         4      1      1994            A
##    Condition DistanceToCity SchoolRating HasPool DaysOnMarket SellingStatus
## 4       Poor           14.2            2       0           77     Available
## 9       Poor            9.8            8       1           42     Available
## 12      Poor            8.9            4       0          113     Available
## 37      Good            7.3            4       0           78          Sold
## 55      Fair            2.0           10       1           89     Available
# Price histogram with dashed markers at the 5th (blue) and 95th (green)
# percentile cut-offs computed earlier in the script
ggplot(housing, aes(x = Price)) +
  geom_histogram(binwidth = 5000, fill = "lightblue", color = "black") +
  geom_vline(xintercept = bottom_5, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = top_95, linetype = "dashed", color = "green") +
  labs(title = "Outlier Properties")

# How does property price vary by condition?
# Mean price for each Condition value; missing prices are ignored.
condition_price <- housing %>%
  group_by(Condition) %>%
  summarise(
    Avg_price = mean(Price, na.rm = TRUE),
    .groups = "drop"
  )
print(condition_price)
## # A tibble: 4 × 2
##   Condition Avg_price
##   <chr>         <dbl>
## 1 Excellent   389179.
## 2 Fair        397169.
## 3 Good        358899 
## 4 Poor        423705.
# Bar chart of average price for each condition
ggplot(condition_price, aes(x = Condition, y = Avg_price, fill = Condition)) +
  geom_col(color = "black") +
  labs(title = "How property price varies by Condition")

# What is the price range by number of bedrooms?

# Mean price for each bedroom count; missing prices are ignored.
# NOTE(review): this summarises the average only; it does not compute a
# minimum/maximum range -- confirm which the question is after.
bedroom_price <- housing %>%
  group_by(Bedrooms) %>%
  summarise(
    Avg_price = mean(Price, na.rm = TRUE),
    .groups = "drop"
  )
print(bedroom_price)
## # A tibble: 4 × 2
##   Bedrooms Avg_price
##      <int>     <dbl>
## 1        2   394494.
## 2        3   381591.
## 3        4   406563.
## 4        5   402968.
# Bar chart of average price by bedroom count
ggplot(bedroom_price, aes(x = Bedrooms, y = Avg_price)) +
  geom_col(fill = "lightblue", color = "black") +
  labs(title = "Price range by no of bedrooms")