# load r packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
library(ggplot2)
# list files in the cloud/project/real estate directory
list.files("/cloud/project/real_estate")
## [1] "2021.xlsx" "2023.xlsx" "2024.xlsx"
# load datasets
data_2022 <- read_excel("/cloud/project/real_estate/2021.xlsx")
head(data_2022)
## # A tibble: 6 × 8
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 2022-03-19 Apartment Rural 270385 2480 5
## 2 1001 2022-11-16 House Downtown 271132 1588 5
## 3 1002 2022-07-20 Apartment Suburb 237756 2271 5
## 4 1003 2022-07-21 House Suburb 255674 1491 4
## 5 1004 2022-11-08 Apartment Suburb 203138 2143 1
## 6 1023 2022-02-25 Apartment Rural 294306 2177 3
## # ℹ 1 more variable: buyer_income <dbl>
data_2023 <- read_excel("/cloud/project/real_estate/2023.xlsx")
head(data_2023)
## # A tibble: 6 × 8
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 2023-02-11 Apartment Downtown 268745 1045 3
## 2 1001 2023-02-12 Condo Downtown 308022 821 3
## 3 1002 2023-11-01 Apartment Rural 221071 1876 4
## 4 1003 2023-03-11 House Rural 304075 1099 4
## 5 1004 2023-04-15 House Rural 247468 827 3
## 6 1005 2023-01-15 House Downtown 257589 2430 2
## # ℹ 1 more variable: buyer_income <dbl>
data_2024 <- read_excel("/cloud/project/real_estate/2024.xlsx")
head(data_2024)
## # A tibble: 6 × 8
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 2024-12-03 House Downtown 289998 2017 5
## 2 1001 2024-02-25 Condo Downtown 241795 NA 5
## 3 1002 2024-08-17 House Rural 275031 800 2
## 4 1003 Invalid Date House Suburb 252769 1850 3
## 5 1004 2024-06-02 House Downtown 329357 1952 5
## 6 1005 2024-02-20 Condo Downtown 259160 2184 3
## # ℹ 1 more variable: buyer_income <dbl>
# How many duplicate sale_id values exist in each dataset?
# Remove them and ensure uniqueness.
# step1: create column for years using mutate function
data_2022 <- data_2022 %>% mutate (Year='2022')
data_2023 <- data_2023 %>% mutate (Year='2023')
data_2024 <- data_2024 %>% mutate (Year='2024')
# step2: create a single table using rbind
combined_df <- rbind(data_2022,data_2023,data_2024)
print(combined_df)
## # A tibble: 300 × 9
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 2022-03-19 Apartment Rural 270385 2480 5
## 2 1001 2022-11-16 House Downtown 271132 1588 5
## 3 1002 2022-07-20 Apartment Suburb 237756 2271 5
## 4 1003 2022-07-21 House Suburb 255674 1491 4
## 5 1004 2022-11-08 Apartment Suburb 203138 2143 1
## 6 1023 2022-02-25 Apartment Rural 294306 2177 3
## 7 1006 2022-10-04 House Rural 235741 1552 2
## 8 1007 2022-01-05 House Downtown 280516 2036 4
## 9 1008 2022-08-12 Condo Downtown 220989 1082 2
## 10 1009 2022-12-28 House Downtown 252248 895 4
## # ℹ 290 more rows
## # ℹ 2 more variables: buyer_income <dbl>, Year <chr>
# establish duplicate sale_ids
duplicates <- combined_df %>%
group_by(sale_id)%>%
summarise(count=n())%>%
filter(sale_id > 1)%>%
arrange(desc(count))
print(duplicates)
## # A tibble: 100 × 2
## sale_id count
## <dbl> <int>
## 1 1017 4
## 2 1019 4
## 3 1020 4
## 4 1024 4
## 5 1030 4
## 6 1033 4
## 7 1037 4
## 8 1044 4
## 9 1069 4
## 10 1074 4
## # ℹ 90 more rows
# remove duplicates
unique_ids <- combined_df %>%
distinct(sale_id,.keep_all = TRUE)
print(unique_ids)
## # A tibble: 100 × 9
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 2022-03-19 Apartment Rural 270385 2480 5
## 2 1001 2022-11-16 House Downtown 271132 1588 5
## 3 1002 2022-07-20 Apartment Suburb 237756 2271 5
## 4 1003 2022-07-21 House Suburb 255674 1491 4
## 5 1004 2022-11-08 Apartment Suburb 203138 2143 1
## 6 1023 2022-02-25 Apartment Rural 294306 2177 3
## 7 1006 2022-10-04 House Rural 235741 1552 2
## 8 1007 2022-01-05 House Downtown 280516 2036 4
## 9 1008 2022-08-12 Condo Downtown 220989 1082 2
## 10 1009 2022-12-28 House Downtown 252248 895 4
## # ℹ 90 more rows
## # ℹ 2 more variables: buyer_income <dbl>, Year <chr>
# confirm duplicates removed
duplicate_test <- unique_ids %>%
group_by(sale_id)%>%
summarise(count=n())%>%
filter(sale_id > 1)%>%
arrange(desc(count))
print(duplicate_test)
## # A tibble: 100 × 2
## sale_id count
## <dbl> <int>
## 1 1000 1
## 2 1001 1
## 3 1002 1
## 4 1003 1
## 5 1004 1
## 6 1005 1
## 7 1006 1
## 8 1007 1
## 9 1008 1
## 10 1009 1
## # ℹ 90 more rows
# How many missing values are present in sq_ft and buyer_income?
# Replace them with appropriate measures (mean).
# Establish missing values in sq_fit and buyer income
missing_values_sf <- combined_df %>%
filter(is.na(sq_ft))%>%
summarise(count=n())
print(missing_values_sf)
## # A tibble: 1 × 1
## count
## <int>
## 1 15
missing_values_bi <- combined_df %>%
filter(is.na(buyer_income))%>%
summarise(count=n())
print(missing_values_bi)
## # A tibble: 1 × 1
## count
## <int>
## 1 15
# remove duplicates
unique_df <- combined_df %>%
filter(is.na(sale_id))%>%
summarise(count=n())
print(unique_df)
## # A tibble: 1 × 1
## count
## <int>
## 1 0
# calculate the mean sq_ft and buyer income for not null
mean_variables <- combined_df %>%
filter(!is.na(sq_ft),!is.na(buyer_income))%>%
summarise(
avg_sqft=mean(sq_ft),
avg_income=mean(buyer_income)
)
print(mean_variables)
## # A tibble: 1 × 2
## avg_sqft avg_income
## <dbl> <dbl>
## 1 1657. 82985.
# replace missing values with mean
combined_df <- combined_df %>%
mutate(
sq_ft = ifelse(is.na(sq_ft),'1657',sq_ft),
buyer_income=ifelse(is.na(buyer_income),'82985',buyer_income)
)
# Verify the replacement
print(combined_df)
## # A tibble: 300 × 9
## sale_id sale_date property_type location sale_price sq_ft bedrooms
## <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 1000 2022-03-19 Apartment Rural 270385 2480 5
## 2 1001 2022-11-16 House Downtown 271132 1588 5
## 3 1002 2022-07-20 Apartment Suburb 237756 2271 5
## 4 1003 2022-07-21 House Suburb 255674 1491 4
## 5 1004 2022-11-08 Apartment Suburb 203138 2143 1
## 6 1023 2022-02-25 Apartment Rural 294306 2177 3
## 7 1006 2022-10-04 House Rural 235741 1552 2
## 8 1007 2022-01-05 House Downtown 280516 2036 4
## 9 1008 2022-08-12 Condo Downtown 220989 1082 2
## 10 1009 2022-12-28 House Downtown 252248 895 4
## # ℹ 290 more rows
## # ℹ 2 more variables: buyer_income <chr>, Year <chr>
#Do buyers in Downtown pay significantly higher prices than those in Suburb?
anova_result <- aov(combined_df$sale_price~combined_df$location)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## combined_df$location 2 4.085e+08 2.042e+08 0.174 0.84
## Residuals 297 3.479e+11 1.171e+09
# visualize results
ggplot(data=combined_df,mapping=aes(x=location,y=sale_price))+
geom_boxplot(color='black', fill='lightblue')+
labs(title='Relationship between sale prices and location')

# how does sales price vary by month
#convert date column to month
combined_df <- combined_df %>%
filter(sale_date !='Invalid Date')%>%
mutate(
sale_date <- as.Date(sale_date,format='%Y,%m,%dd'),
month_sale_date = month(sale_date,label=TRUE, abbr=TRUE)
)
#establish price variance by month and property type
variance <- combined_df %>%
group_by(month_sale_date)%>%
summarise(Total_sales=sum(sale_price,na.rm=TRUE),.groups='drop')
#print results
print(variance)
## # A tibble: 12 × 2
## month_sale_date Total_sales
## <ord> <dbl>
## 1 Jan 7854935
## 2 Feb 7297647
## 3 Mar 4712788
## 4 Apr 6809031
## 5 May 5417996
## 6 Jun 7680867
## 7 Jul 5493577
## 8 Aug 6578342
## 9 Sep 7088732
## 10 Oct 6057836
## 11 Nov 8016794
## 12 Dec 6388169
#visualize results
ggplot(data=variance,mapping=aes(x=month_sale_date,y=Total_sales))+
geom_col(color='black',fill='orange')+
labs(title='Sales Price Generated by Month')

#Build a linear regression model to predict sale_price using, bedrooms and buyer income
linear_model <- lm(sale_price~bedrooms+buyer_income,data=combined_df)
print(linear_model)
##
## Call:
## lm(formula = sale_price ~ bedrooms + buyer_income, data = combined_df)
##
## Coefficients:
## (Intercept) bedrooms buyer_income100475 buyer_income100567
## 297274.5 3628.2 -2322.2 -14874.2
## buyer_income100921 buyer_income101124 buyer_income101125 buyer_income101146
## -13621.3 -62040.0 -13345.7 -4580.3
## buyer_income101278 buyer_income101691 buyer_income102103 buyer_income102151
## -11520.8 -35686.0 30402.2 -34115.3
## buyer_income102547 buyer_income102551 buyer_income103048 buyer_income103203
## 3696.3 -15858.7 -51520.0 -62371.2
## buyer_income103801 buyer_income103964 buyer_income104303 buyer_income104357
## -29499.8 -49281.7 -17501.0 30807.2
## buyer_income104386 buyer_income104549 buyer_income104641 buyer_income105309
## 20371.7 11790.0 10525.7 -8217.2
## buyer_income105654 buyer_income105755 buyer_income105768 buyer_income106000
## -50067.8 -26494.3 -3886.2 9516.0
## buyer_income106100 buyer_income106315 buyer_income107092 buyer_income107532
## -52983.0 14305.0 -39892.0 -89776.3
## buyer_income108336 buyer_income108549 buyer_income108974 buyer_income110554
## -55440.0 -42017.3 -73620.3 11473.8
## buyer_income110758 buyer_income111266 buyer_income111322 buyer_income111889
## -17598.2 5217.8 -43598.2 -26522.7
## buyer_income113225 buyer_income114754 buyer_income115835 buyer_income116076
## 31536.2 7944.7 13528.3 -36877.3
## buyer_income116478 buyer_income117225 buyer_income117724 buyer_income118507
## -54252.0 -3128.8 -18804.8 -68473.3
## buyer_income119512 buyer_income119831 buyer_income50397 buyer_income51407
## -25417.3 6432.8 -102732.8 -41450.7
## buyer_income51446 buyer_income52523 buyer_income52623 buyer_income52969
## -101669.8 -28993.8 -115358.3 -71107.7
## buyer_income53085 buyer_income54956 buyer_income54977 buyer_income54995
## -18717.3 -99864.0 -78241.2 -83215.8
## buyer_income55838 buyer_income56348 buyer_income57119 buyer_income57380
## -58754.7 -82944.8 -52226.7 -83744.7
## buyer_income58795 buyer_income58960 buyer_income59032 buyer_income59206
## -93206.0 -87078.0 -109366.2 -102606.2
## buyer_income59787 buyer_income59824 buyer_income60053 buyer_income60166
## -62154.2 -98829.8 -22223.2 -39978.7
## buyer_income60317 buyer_income60438 buyer_income60782 buyer_income60792
## -82664.8 -61064.2 -65459.7 -13853.0
## buyer_income61307 buyer_income61357 buyer_income61425 buyer_income61753
## -111162.2 -75727.8 -71853.8 -46818.3
## buyer_income62068 buyer_income62312 buyer_income62596 buyer_income62686
## -23745.8 -47025.0 -44283.3 -45030.3
## buyer_income62810 buyer_income62935 buyer_income63042 buyer_income63173
## -65811.2 -70187.2 -90245.2 -6712.8
## buyer_income63442 buyer_income63522 buyer_income63628 buyer_income63784
## -54404.8 16132.3 -42276.3 -33493.8
## buyer_income63813 buyer_income64081 buyer_income64573 buyer_income64750
## 854.8 -32091.8 -30765.8 -60776.3
## buyer_income64929 buyer_income65007 buyer_income65028 buyer_income65284
## -56334.3 -50679.8 -20135.0 -64425.0
## buyer_income65331 buyer_income66158 buyer_income66240 buyer_income66991
## -27947.7 -64071.0 -53416.3 -44573.8
## buyer_income67698 buyer_income67718 buyer_income67887 buyer_income68092
## -36279.0 -77659.3 -100069.0 -48158.3
## buyer_income68334 buyer_income68488 buyer_income68813 buyer_income69322
## -59114.8 -71495.3 -39790.2 -57008.0
## buyer_income69347 buyer_income69348 buyer_income69419 buyer_income69628
## -90716.2 -100055.2 -60691.0 -18996.8
## buyer_income69636 buyer_income69665 buyer_income69717 buyer_income69878
## -35472.0 -65256.7 -51308.3 -83928.2
## buyer_income69891 buyer_income70046 buyer_income70377 buyer_income70601
## -19815.0 14350.7 -21022.0 -80638.2
## buyer_income70612 buyer_income70820 buyer_income70977 buyer_income71219
## -21607.7 7099.2 -25201.7 -10679.7
## buyer_income71837 buyer_income71892 buyer_income71972 buyer_income72145
## 8699.2 -45047.3 -59539.2 -36768.3
## buyer_income72208 buyer_income72334 buyer_income72360 buyer_income72545
## -13698.7 -12307.2 -280.7 -36129.8
## buyer_income73008 buyer_income73022 buyer_income73137 buyer_income73393
## -34788.0 -59371.8 -19632.2 -79681.3
## buyer_income73413 buyer_income73706 buyer_income73715 buyer_income73811
## -60904.3 -19207.8 -46360.7 -66510.8
## buyer_income73812 buyer_income73990 buyer_income74001 buyer_income74767
## -83541.8 -5635.3 -75479.8 -65565.7
## buyer_income74779 buyer_income74904 buyer_income74933 buyer_income74947
## 10079.8 -61837.0 -72234.2 -12245.8
## buyer_income75067 buyer_income75186 buyer_income75245 buyer_income75439
## -137.0 -60075.8 -94457.3 -77705.3
## buyer_income75475 buyer_income76059 buyer_income76409 buyer_income76479
## -76964.3 -63703.3 -49190.0 -45933.8
## buyer_income76514 buyer_income76616 buyer_income76917 buyer_income76993
## -54321.2 -47513.8 -18026.8 -68102.0
## buyer_income77373 buyer_income77421 buyer_income77486 buyer_income77778
## -35958.0 -22462.8 -37898.8 -65623.3
## buyer_income77987 buyer_income78275 buyer_income78476 buyer_income78552
## 5664.3 -9055.0 -49324.2 -57545.8
## buyer_income78673 buyer_income78763 buyer_income78818 buyer_income79280
## -79678.8 -10924.2 21589.8 -6242.8
## buyer_income79555 buyer_income79805 buyer_income79814 buyer_income79863
## -31271.2 -28828.8 -83564.2 -83015.7
## buyer_income79915 buyer_income80241 buyer_income80396 buyer_income80449
## -30188.0 -96644.3 -26343.3 7996.2
## buyer_income81284 buyer_income81374 buyer_income81470 buyer_income81506
## -77872.0 -3033.8 -72807.7 -41361.3
## buyer_income81637 buyer_income81854 buyer_income82530 buyer_income82619
## -4117.7 8729.7 -59963.8 -32833.7
## buyer_income82774 buyer_income82806 buyer_income82923 buyer_income82985
## -59735.0 -53936.8 -44611.0 -32759.2
## buyer_income83079 buyer_income83120 buyer_income83782 buyer_income84345
## -67921.7 -51182.7 -49184.3 -4957.3
## buyer_income84523 buyer_income84992 buyer_income85056 buyer_income85306
## -792.0 -23146.2 -20049.7 13941.7
## buyer_income85325 buyer_income85672 buyer_income85674 buyer_income85784
## -18100.2 -54829.8 -45980.0 -1578.2
## buyer_income86007 buyer_income86252 buyer_income86392 buyer_income86434
## -4834.8 6465.8 -17936.7 28119.0
## buyer_income86628 buyer_income88171 buyer_income88181 buyer_income88369
## 29259.3 -70923.3 -41638.8 2915.2
## buyer_income88509 buyer_income88565 buyer_income88635 buyer_income88762
## -108094.0 -76137.2 -69976.2 -7712.2
## buyer_income88875 buyer_income89218 buyer_income89264 buyer_income89310
## 4305.3 35365.3 -24894.2 -83037.7
## buyer_income89952 buyer_income90065 buyer_income90298 buyer_income90330
## 25509.8 -39414.0 19230.0 -106837.0
## buyer_income90546 buyer_income90599 buyer_income90674 buyer_income90927
## -26683.3 -56722.7 -50744.7 -36182.7
## buyer_income91061 buyer_income91072 buyer_income91210 buyer_income91920
## -15612.0 -65046.0 -13990.8 17868.7
## buyer_income92007 buyer_income92009 buyer_income92024 buyer_income92039
## -76358.3 -76389.2 -64141.8 -31170.0
## buyer_income92583 buyer_income92601 buyer_income92859 buyer_income92972
## -41772.2 -29248.0 -12515.2 -27389.2
## buyer_income93059 buyer_income93153 buyer_income93679 buyer_income93696
## -32617.0 -11719.7 11483.8 -25461.0
## buyer_income93828 buyer_income93929 buyer_income93945 buyer_income94050
## -60037.7 6758.8 -13915.3 -31200.8
## buyer_income94224 buyer_income94291 buyer_income94477 buyer_income94737
## -104200.0 -36332.3 -28225.7 -21519.8
## buyer_income95412 buyer_income95570 buyer_income95644 buyer_income96382
## -21511.0 -721.8 -56113.2 -19222.0
## buyer_income96661 buyer_income96996 buyer_income97333 buyer_income97395
## -27691.0 -38607.3 -93839.0 -27008.8
## buyer_income97434 buyer_income97576 buyer_income97605 buyer_income97949
## -15183.2 -94931.7 -66459.7 -63826.3
## buyer_income98058 buyer_income98142 buyer_income98149 buyer_income98191
## -30900.2 15732.7 -71294.3 -85053.3
## buyer_income98244 buyer_income98297 buyer_income98861 buyer_income99023
## -68789.8 30228.2 1635.0 -37905.7
## buyer_income99474 buyer_income99894 buyer_income99897 buyer_income99980
## -73767.3 -20291.3 -48999.0 23698.2
#Identify and visualize outliers in sale_price
#calculate IQR
price_iQr <- IQR (combined_df$sale_price, na.rm=TRUE)
q1_price <- quantile(combined_df$sale_price,0.25,na.rm=TRUE)
q2_price <- quantile(combined_df$sale_price,0.75,na.rm=TRUE)
#define lower and upper bounds
lower_bound_price <- q1_price -1.5 * price_iQr
upper_bound_price <- q2_price +1.5 * price_iQr
#identify outliers
price_outliers <- combined_df$sale_price <lower_bound_price | combined_df$sale_price > upper_bound_price
#print result
outlier_values <- combined_df$sale_price[price_outliers]
print(outlier_values)
## numeric(0)
# what is the property sales price distribution by location
ggplot(data=combined_df,mapping=aes(x=location,y=sale_price,fill=location))+
geom_boxplot(color='black')+
labs(title='Property sales price distribution by location')

#how does the rate of income vary by location
#convert buyer income from character to numeric
combined_df$buyer_income <- as.numeric(combined_df$buyer_income)
#sum and mutate to establish rate
income_location <- combined_df %>%
group_by(location)%>%
summarise(
total_income = sum(buyer_income,na.rm=TRUE),
) %>%
mutate(
income_rate=(total_income/sum(total_income)) * 100
)
#print results
print(income_location)
## # A tibble: 3 × 3
## location total_income income_rate
## <chr> <dbl> <dbl>
## 1 Downtown 8616570 35.3
## 2 Rural 8627064 35.4
## 3 Suburb 7144350 29.3
#visualize results
ggplot(data=income_location,mapping=aes(x=location,y=income_rate))+
geom_col(color='black',fill='brown')+
labs(title='Rate of income by location')
