# load r packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
library(ggplot2)

# list files in the cloud/project/real estate directory
list.files("/cloud/project/real_estate")
## [1] "2021.xlsx" "2023.xlsx" "2024.xlsx"
# load datasets
data_2022 <- read_excel("/cloud/project/real_estate/2021.xlsx")
head(data_2022)
## # A tibble: 6 × 8
##   sale_id sale_date  property_type location sale_price sq_ft bedrooms
##     <dbl> <chr>      <chr>         <chr>         <dbl> <dbl>    <dbl>
## 1    1000 2022-03-19 Apartment     Rural        270385  2480        5
## 2    1001 2022-11-16 House         Downtown     271132  1588        5
## 3    1002 2022-07-20 Apartment     Suburb       237756  2271        5
## 4    1003 2022-07-21 House         Suburb       255674  1491        4
## 5    1004 2022-11-08 Apartment     Suburb       203138  2143        1
## 6    1023 2022-02-25 Apartment     Rural        294306  2177        3
## # ℹ 1 more variable: buyer_income <dbl>
data_2023 <- read_excel("/cloud/project/real_estate/2023.xlsx")
head(data_2023)
## # A tibble: 6 × 8
##   sale_id sale_date  property_type location sale_price sq_ft bedrooms
##     <dbl> <chr>      <chr>         <chr>         <dbl> <dbl>    <dbl>
## 1    1000 2023-02-11 Apartment     Downtown     268745  1045        3
## 2    1001 2023-02-12 Condo         Downtown     308022   821        3
## 3    1002 2023-11-01 Apartment     Rural        221071  1876        4
## 4    1003 2023-03-11 House         Rural        304075  1099        4
## 5    1004 2023-04-15 House         Rural        247468   827        3
## 6    1005 2023-01-15 House         Downtown     257589  2430        2
## # ℹ 1 more variable: buyer_income <dbl>
data_2024 <- read_excel("/cloud/project/real_estate/2024.xlsx")
head(data_2024)
## # A tibble: 6 × 8
##   sale_id sale_date    property_type location sale_price sq_ft bedrooms
##     <dbl> <chr>        <chr>         <chr>         <dbl> <dbl>    <dbl>
## 1    1000 2024-12-03   House         Downtown     289998  2017        5
## 2    1001 2024-02-25   Condo         Downtown     241795    NA        5
## 3    1002 2024-08-17   House         Rural        275031   800        2
## 4    1003 Invalid Date House         Suburb       252769  1850        3
## 5    1004 2024-06-02   House         Downtown     329357  1952        5
## 6    1005 2024-02-20   Condo         Downtown     259160  2184        3
## # ℹ 1 more variable: buyer_income <dbl>
# How many duplicate sale_id values exist in each dataset? 
# Remove them and ensure uniqueness.

# step1: create column for years using mutate function
data_2022 <- data_2022 %>% mutate (Year='2022')
data_2023 <- data_2023 %>% mutate (Year='2023')
data_2024 <- data_2024 %>% mutate (Year='2024')

# step2: create a single table using rbind
combined_df <- rbind(data_2022,data_2023,data_2024)

print(combined_df)
## # A tibble: 300 × 9
##    sale_id sale_date  property_type location sale_price sq_ft bedrooms
##      <dbl> <chr>      <chr>         <chr>         <dbl> <dbl>    <dbl>
##  1    1000 2022-03-19 Apartment     Rural        270385  2480        5
##  2    1001 2022-11-16 House         Downtown     271132  1588        5
##  3    1002 2022-07-20 Apartment     Suburb       237756  2271        5
##  4    1003 2022-07-21 House         Suburb       255674  1491        4
##  5    1004 2022-11-08 Apartment     Suburb       203138  2143        1
##  6    1023 2022-02-25 Apartment     Rural        294306  2177        3
##  7    1006 2022-10-04 House         Rural        235741  1552        2
##  8    1007 2022-01-05 House         Downtown     280516  2036        4
##  9    1008 2022-08-12 Condo         Downtown     220989  1082        2
## 10    1009 2022-12-28 House         Downtown     252248   895        4
## # ℹ 290 more rows
## # ℹ 2 more variables: buyer_income <dbl>, Year <chr>
# establish duplicate sale_ids
duplicates <- combined_df %>%
  group_by(sale_id)%>%
  summarise(count=n())%>%
  filter(sale_id > 1)%>%
  arrange(desc(count))
print(duplicates)
## # A tibble: 100 × 2
##    sale_id count
##      <dbl> <int>
##  1    1017     4
##  2    1019     4
##  3    1020     4
##  4    1024     4
##  5    1030     4
##  6    1033     4
##  7    1037     4
##  8    1044     4
##  9    1069     4
## 10    1074     4
## # ℹ 90 more rows
# remove duplicates
unique_ids <- combined_df %>%
  distinct(sale_id,.keep_all = TRUE)
print(unique_ids)
## # A tibble: 100 × 9
##    sale_id sale_date  property_type location sale_price sq_ft bedrooms
##      <dbl> <chr>      <chr>         <chr>         <dbl> <dbl>    <dbl>
##  1    1000 2022-03-19 Apartment     Rural        270385  2480        5
##  2    1001 2022-11-16 House         Downtown     271132  1588        5
##  3    1002 2022-07-20 Apartment     Suburb       237756  2271        5
##  4    1003 2022-07-21 House         Suburb       255674  1491        4
##  5    1004 2022-11-08 Apartment     Suburb       203138  2143        1
##  6    1023 2022-02-25 Apartment     Rural        294306  2177        3
##  7    1006 2022-10-04 House         Rural        235741  1552        2
##  8    1007 2022-01-05 House         Downtown     280516  2036        4
##  9    1008 2022-08-12 Condo         Downtown     220989  1082        2
## 10    1009 2022-12-28 House         Downtown     252248   895        4
## # ℹ 90 more rows
## # ℹ 2 more variables: buyer_income <dbl>, Year <chr>
# confirm duplicates removed
duplicate_test <- unique_ids %>%
  group_by(sale_id)%>%
  summarise(count=n())%>%
  filter(sale_id > 1)%>%
  arrange(desc(count))
print(duplicate_test)
## # A tibble: 100 × 2
##    sale_id count
##      <dbl> <int>
##  1    1000     1
##  2    1001     1
##  3    1002     1
##  4    1003     1
##  5    1004     1
##  6    1005     1
##  7    1006     1
##  8    1007     1
##  9    1008     1
## 10    1009     1
## # ℹ 90 more rows
# How many missing values are present in sq_ft and buyer_income? 
# Replace them with appropriate measures (mean).

# Establish missing values in sq_fit and buyer income
missing_values_sf <- combined_df %>%
  filter(is.na(sq_ft))%>%
  summarise(count=n())
print(missing_values_sf)
## # A tibble: 1 × 1
##   count
##   <int>
## 1    15
missing_values_bi <- combined_df %>%
  filter(is.na(buyer_income))%>%
  summarise(count=n())
print(missing_values_bi)
## # A tibble: 1 × 1
##   count
##   <int>
## 1    15
# remove duplicates
unique_df <- combined_df %>%
  filter(is.na(sale_id))%>%
  summarise(count=n())
print(unique_df)
## # A tibble: 1 × 1
##   count
##   <int>
## 1     0
# calculate the mean sq_ft and buyer income for not null
mean_variables <- combined_df %>%
  filter(!is.na(sq_ft),!is.na(buyer_income))%>%
  summarise(
    avg_sqft=mean(sq_ft),
    avg_income=mean(buyer_income)
    )
print(mean_variables)
## # A tibble: 1 × 2
##   avg_sqft avg_income
##      <dbl>      <dbl>
## 1    1657.     82985.
# replace missing values with mean
combined_df <- combined_df %>%
  mutate(
    sq_ft = ifelse(is.na(sq_ft),'1657',sq_ft),
    buyer_income=ifelse(is.na(buyer_income),'82985',buyer_income)
    )

# Verify the replacement
print(combined_df)
## # A tibble: 300 × 9
##    sale_id sale_date  property_type location sale_price sq_ft bedrooms
##      <dbl> <chr>      <chr>         <chr>         <dbl> <chr>    <dbl>
##  1    1000 2022-03-19 Apartment     Rural        270385 2480         5
##  2    1001 2022-11-16 House         Downtown     271132 1588         5
##  3    1002 2022-07-20 Apartment     Suburb       237756 2271         5
##  4    1003 2022-07-21 House         Suburb       255674 1491         4
##  5    1004 2022-11-08 Apartment     Suburb       203138 2143         1
##  6    1023 2022-02-25 Apartment     Rural        294306 2177         3
##  7    1006 2022-10-04 House         Rural        235741 1552         2
##  8    1007 2022-01-05 House         Downtown     280516 2036         4
##  9    1008 2022-08-12 Condo         Downtown     220989 1082         2
## 10    1009 2022-12-28 House         Downtown     252248 895          4
## # ℹ 290 more rows
## # ℹ 2 more variables: buyer_income <chr>, Year <chr>
#Do buyers in Downtown pay significantly higher prices than those in Suburb? 

anova_result <- aov(combined_df$sale_price~combined_df$location)
summary(anova_result)
##                       Df    Sum Sq   Mean Sq F value Pr(>F)
## combined_df$location   2 4.085e+08 2.042e+08   0.174   0.84
## Residuals            297 3.479e+11 1.171e+09
# visualize results
ggplot(data=combined_df,mapping=aes(x=location,y=sale_price))+
  geom_boxplot(color='black', fill='lightblue')+
  labs(title='Relationship between sale prices and location')

# how does sales price vary by month

#convert date column to month
combined_df <- combined_df %>%
  filter(sale_date !='Invalid Date')%>%
  mutate(
    sale_date <- as.Date(sale_date,format='%Y,%m,%dd'),
    month_sale_date = month(sale_date,label=TRUE, abbr=TRUE)
  ) 

#establish price variance by month and property type
variance <- combined_df %>%
  group_by(month_sale_date)%>%
  summarise(Total_sales=sum(sale_price,na.rm=TRUE),.groups='drop')

#print results
print(variance)
## # A tibble: 12 × 2
##    month_sale_date Total_sales
##    <ord>                 <dbl>
##  1 Jan                 7854935
##  2 Feb                 7297647
##  3 Mar                 4712788
##  4 Apr                 6809031
##  5 May                 5417996
##  6 Jun                 7680867
##  7 Jul                 5493577
##  8 Aug                 6578342
##  9 Sep                 7088732
## 10 Oct                 6057836
## 11 Nov                 8016794
## 12 Dec                 6388169
#visualize results
ggplot(data=variance,mapping=aes(x=month_sale_date,y=Total_sales))+
  geom_col(color='black',fill='orange')+
  labs(title='Sales Price Generated by Month')

#Build a linear regression model to predict sale_price using, bedrooms and buyer income

linear_model <- lm(sale_price~bedrooms+buyer_income,data=combined_df)
print(linear_model)
## 
## Call:
## lm(formula = sale_price ~ bedrooms + buyer_income, data = combined_df)
## 
## Coefficients:
##        (Intercept)            bedrooms  buyer_income100475  buyer_income100567  
##           297274.5              3628.2             -2322.2            -14874.2  
## buyer_income100921  buyer_income101124  buyer_income101125  buyer_income101146  
##           -13621.3            -62040.0            -13345.7             -4580.3  
## buyer_income101278  buyer_income101691  buyer_income102103  buyer_income102151  
##           -11520.8            -35686.0             30402.2            -34115.3  
## buyer_income102547  buyer_income102551  buyer_income103048  buyer_income103203  
##             3696.3            -15858.7            -51520.0            -62371.2  
## buyer_income103801  buyer_income103964  buyer_income104303  buyer_income104357  
##           -29499.8            -49281.7            -17501.0             30807.2  
## buyer_income104386  buyer_income104549  buyer_income104641  buyer_income105309  
##            20371.7             11790.0             10525.7             -8217.2  
## buyer_income105654  buyer_income105755  buyer_income105768  buyer_income106000  
##           -50067.8            -26494.3             -3886.2              9516.0  
## buyer_income106100  buyer_income106315  buyer_income107092  buyer_income107532  
##           -52983.0             14305.0            -39892.0            -89776.3  
## buyer_income108336  buyer_income108549  buyer_income108974  buyer_income110554  
##           -55440.0            -42017.3            -73620.3             11473.8  
## buyer_income110758  buyer_income111266  buyer_income111322  buyer_income111889  
##           -17598.2              5217.8            -43598.2            -26522.7  
## buyer_income113225  buyer_income114754  buyer_income115835  buyer_income116076  
##            31536.2              7944.7             13528.3            -36877.3  
## buyer_income116478  buyer_income117225  buyer_income117724  buyer_income118507  
##           -54252.0             -3128.8            -18804.8            -68473.3  
## buyer_income119512  buyer_income119831   buyer_income50397   buyer_income51407  
##           -25417.3              6432.8           -102732.8            -41450.7  
##  buyer_income51446   buyer_income52523   buyer_income52623   buyer_income52969  
##          -101669.8            -28993.8           -115358.3            -71107.7  
##  buyer_income53085   buyer_income54956   buyer_income54977   buyer_income54995  
##           -18717.3            -99864.0            -78241.2            -83215.8  
##  buyer_income55838   buyer_income56348   buyer_income57119   buyer_income57380  
##           -58754.7            -82944.8            -52226.7            -83744.7  
##  buyer_income58795   buyer_income58960   buyer_income59032   buyer_income59206  
##           -93206.0            -87078.0           -109366.2           -102606.2  
##  buyer_income59787   buyer_income59824   buyer_income60053   buyer_income60166  
##           -62154.2            -98829.8            -22223.2            -39978.7  
##  buyer_income60317   buyer_income60438   buyer_income60782   buyer_income60792  
##           -82664.8            -61064.2            -65459.7            -13853.0  
##  buyer_income61307   buyer_income61357   buyer_income61425   buyer_income61753  
##          -111162.2            -75727.8            -71853.8            -46818.3  
##  buyer_income62068   buyer_income62312   buyer_income62596   buyer_income62686  
##           -23745.8            -47025.0            -44283.3            -45030.3  
##  buyer_income62810   buyer_income62935   buyer_income63042   buyer_income63173  
##           -65811.2            -70187.2            -90245.2             -6712.8  
##  buyer_income63442   buyer_income63522   buyer_income63628   buyer_income63784  
##           -54404.8             16132.3            -42276.3            -33493.8  
##  buyer_income63813   buyer_income64081   buyer_income64573   buyer_income64750  
##              854.8            -32091.8            -30765.8            -60776.3  
##  buyer_income64929   buyer_income65007   buyer_income65028   buyer_income65284  
##           -56334.3            -50679.8            -20135.0            -64425.0  
##  buyer_income65331   buyer_income66158   buyer_income66240   buyer_income66991  
##           -27947.7            -64071.0            -53416.3            -44573.8  
##  buyer_income67698   buyer_income67718   buyer_income67887   buyer_income68092  
##           -36279.0            -77659.3           -100069.0            -48158.3  
##  buyer_income68334   buyer_income68488   buyer_income68813   buyer_income69322  
##           -59114.8            -71495.3            -39790.2            -57008.0  
##  buyer_income69347   buyer_income69348   buyer_income69419   buyer_income69628  
##           -90716.2           -100055.2            -60691.0            -18996.8  
##  buyer_income69636   buyer_income69665   buyer_income69717   buyer_income69878  
##           -35472.0            -65256.7            -51308.3            -83928.2  
##  buyer_income69891   buyer_income70046   buyer_income70377   buyer_income70601  
##           -19815.0             14350.7            -21022.0            -80638.2  
##  buyer_income70612   buyer_income70820   buyer_income70977   buyer_income71219  
##           -21607.7              7099.2            -25201.7            -10679.7  
##  buyer_income71837   buyer_income71892   buyer_income71972   buyer_income72145  
##             8699.2            -45047.3            -59539.2            -36768.3  
##  buyer_income72208   buyer_income72334   buyer_income72360   buyer_income72545  
##           -13698.7            -12307.2              -280.7            -36129.8  
##  buyer_income73008   buyer_income73022   buyer_income73137   buyer_income73393  
##           -34788.0            -59371.8            -19632.2            -79681.3  
##  buyer_income73413   buyer_income73706   buyer_income73715   buyer_income73811  
##           -60904.3            -19207.8            -46360.7            -66510.8  
##  buyer_income73812   buyer_income73990   buyer_income74001   buyer_income74767  
##           -83541.8             -5635.3            -75479.8            -65565.7  
##  buyer_income74779   buyer_income74904   buyer_income74933   buyer_income74947  
##            10079.8            -61837.0            -72234.2            -12245.8  
##  buyer_income75067   buyer_income75186   buyer_income75245   buyer_income75439  
##             -137.0            -60075.8            -94457.3            -77705.3  
##  buyer_income75475   buyer_income76059   buyer_income76409   buyer_income76479  
##           -76964.3            -63703.3            -49190.0            -45933.8  
##  buyer_income76514   buyer_income76616   buyer_income76917   buyer_income76993  
##           -54321.2            -47513.8            -18026.8            -68102.0  
##  buyer_income77373   buyer_income77421   buyer_income77486   buyer_income77778  
##           -35958.0            -22462.8            -37898.8            -65623.3  
##  buyer_income77987   buyer_income78275   buyer_income78476   buyer_income78552  
##             5664.3             -9055.0            -49324.2            -57545.8  
##  buyer_income78673   buyer_income78763   buyer_income78818   buyer_income79280  
##           -79678.8            -10924.2             21589.8             -6242.8  
##  buyer_income79555   buyer_income79805   buyer_income79814   buyer_income79863  
##           -31271.2            -28828.8            -83564.2            -83015.7  
##  buyer_income79915   buyer_income80241   buyer_income80396   buyer_income80449  
##           -30188.0            -96644.3            -26343.3              7996.2  
##  buyer_income81284   buyer_income81374   buyer_income81470   buyer_income81506  
##           -77872.0             -3033.8            -72807.7            -41361.3  
##  buyer_income81637   buyer_income81854   buyer_income82530   buyer_income82619  
##            -4117.7              8729.7            -59963.8            -32833.7  
##  buyer_income82774   buyer_income82806   buyer_income82923   buyer_income82985  
##           -59735.0            -53936.8            -44611.0            -32759.2  
##  buyer_income83079   buyer_income83120   buyer_income83782   buyer_income84345  
##           -67921.7            -51182.7            -49184.3             -4957.3  
##  buyer_income84523   buyer_income84992   buyer_income85056   buyer_income85306  
##             -792.0            -23146.2            -20049.7             13941.7  
##  buyer_income85325   buyer_income85672   buyer_income85674   buyer_income85784  
##           -18100.2            -54829.8            -45980.0             -1578.2  
##  buyer_income86007   buyer_income86252   buyer_income86392   buyer_income86434  
##            -4834.8              6465.8            -17936.7             28119.0  
##  buyer_income86628   buyer_income88171   buyer_income88181   buyer_income88369  
##            29259.3            -70923.3            -41638.8              2915.2  
##  buyer_income88509   buyer_income88565   buyer_income88635   buyer_income88762  
##          -108094.0            -76137.2            -69976.2             -7712.2  
##  buyer_income88875   buyer_income89218   buyer_income89264   buyer_income89310  
##             4305.3             35365.3            -24894.2            -83037.7  
##  buyer_income89952   buyer_income90065   buyer_income90298   buyer_income90330  
##            25509.8            -39414.0             19230.0           -106837.0  
##  buyer_income90546   buyer_income90599   buyer_income90674   buyer_income90927  
##           -26683.3            -56722.7            -50744.7            -36182.7  
##  buyer_income91061   buyer_income91072   buyer_income91210   buyer_income91920  
##           -15612.0            -65046.0            -13990.8             17868.7  
##  buyer_income92007   buyer_income92009   buyer_income92024   buyer_income92039  
##           -76358.3            -76389.2            -64141.8            -31170.0  
##  buyer_income92583   buyer_income92601   buyer_income92859   buyer_income92972  
##           -41772.2            -29248.0            -12515.2            -27389.2  
##  buyer_income93059   buyer_income93153   buyer_income93679   buyer_income93696  
##           -32617.0            -11719.7             11483.8            -25461.0  
##  buyer_income93828   buyer_income93929   buyer_income93945   buyer_income94050  
##           -60037.7              6758.8            -13915.3            -31200.8  
##  buyer_income94224   buyer_income94291   buyer_income94477   buyer_income94737  
##          -104200.0            -36332.3            -28225.7            -21519.8  
##  buyer_income95412   buyer_income95570   buyer_income95644   buyer_income96382  
##           -21511.0              -721.8            -56113.2            -19222.0  
##  buyer_income96661   buyer_income96996   buyer_income97333   buyer_income97395  
##           -27691.0            -38607.3            -93839.0            -27008.8  
##  buyer_income97434   buyer_income97576   buyer_income97605   buyer_income97949  
##           -15183.2            -94931.7            -66459.7            -63826.3  
##  buyer_income98058   buyer_income98142   buyer_income98149   buyer_income98191  
##           -30900.2             15732.7            -71294.3            -85053.3  
##  buyer_income98244   buyer_income98297   buyer_income98861   buyer_income99023  
##           -68789.8             30228.2              1635.0            -37905.7  
##  buyer_income99474   buyer_income99894   buyer_income99897   buyer_income99980  
##           -73767.3            -20291.3            -48999.0             23698.2
#Identify and visualize outliers in sale_price 

#calculate IQR
price_iQr <- IQR (combined_df$sale_price, na.rm=TRUE)
q1_price <- quantile(combined_df$sale_price,0.25,na.rm=TRUE)
q2_price <- quantile(combined_df$sale_price,0.75,na.rm=TRUE)

#define lower and upper bounds
lower_bound_price <- q1_price -1.5 * price_iQr
upper_bound_price <- q2_price +1.5 * price_iQr

#identify outliers
price_outliers <- combined_df$sale_price <lower_bound_price | combined_df$sale_price > upper_bound_price

#print result
outlier_values <- combined_df$sale_price[price_outliers]
print(outlier_values)
## numeric(0)
# what is the property sales price distribution by location

ggplot(data=combined_df,mapping=aes(x=location,y=sale_price,fill=location))+
  geom_boxplot(color='black')+
  labs(title='Property sales price distribution by location')

#how does the rate of income vary by location

#convert buyer income from character to numeric
combined_df$buyer_income <- as.numeric(combined_df$buyer_income)

#sum and mutate to establish rate
income_location <- combined_df %>%
  group_by(location)%>%
  summarise(
    total_income = sum(buyer_income,na.rm=TRUE),
  ) %>%
mutate(
  income_rate=(total_income/sum(total_income)) * 100
)

#print results
 print(income_location)
## # A tibble: 3 × 3
##   location total_income income_rate
##   <chr>           <dbl>       <dbl>
## 1 Downtown      8616570        35.3
## 2 Rural         8627064        35.4
## 3 Suburb        7144350        29.3
#visualize results
 ggplot(data=income_location,mapping=aes(x=location,y=income_rate))+
   geom_col(color='black',fill='brown')+
   labs(title='Rate of income by location')