Establishing a connection to the sql database
config <- yaml::read_yaml("config.yaml")
con <- dbConnect(
RMySQL::MySQL(),
dbname = config$dbname,
host = config$host,
port = config$port,
user = config$user,
password = config$password
)
Store each table in the database in a dataframe
## 2018
query1 <- "SELECT * FROM 607final.2018_property_sales"
property_sales_2018 <- dbGetQuery(con, query1)
## 2019
query2 <- "SELECT * FROM 607final.2019_property_sales"
property_sales_2019 <- dbGetQuery(con, query2)
## 2020
query3 <- "SELECT * FROM 607final.2020_property_sales"
property_sales_2020 <- dbGetQuery(con, query3)
## 2022
query4 <- "SELECT * FROM 607final.2022_property_sales"
property_sales_2022 <- dbGetQuery(con, query4)
Store the 2021 csv dataframe in mongoddb atlas
uri <- "mongodb+srv://bobbtilon:I8aGyfSubO4lteBf@cluster0.t25lesh.mongodb.net/cluster0?retryWrites=true&w=majority"
conn <- mongo(collection = "final", db = "cluster0", url = uri)
property_sales_2021 <-conn$find()
If connection to mongodb doesn’t work do the following and delete the above codechunk
## url <- "https://raw.githubusercontent.com/Kingtilon1/DATA607/main/Final/2021_property_sales.csv"
## property_sales_2021 <- read.csv(url)
My hypothesis is that there will be a consistent increase throughout the years
names(property_sales_2022)[names(property_sales_2022) == "PropertyID"] <- "PropertyID"
clean_and_standardize <- function(df) {
df$Sale_price <- as.numeric(gsub("[\\$,]", "", df$Sale_price))
return(df)
}
# Assuming property_sales_2020 and property_sales_2021 are your data frames
property_sales_2020 <- clean_and_standardize(property_sales_2020)
property_sales_2021 <- clean_and_standardize(property_sales_2021)
The dataframe encompasses information on 886 properties, including categorical variables such as property type (PropType), address, and various descriptors like condominium projects (CondoProject), districts (District), neighborhoods (Nbhd), and architectural styles (Style). Numerical variables like tax keys (Taxkey) range from 1.002e+07 to 2.141e+09, with a mean value of approximately 1.394e+09. Notable summary statistics reveal a diversity of property characteristics, including the number of stories (Stories), year of construction (Year_Built), room count (Nr_of_rms), square footage (Fin_sqft), number of bedrooms (Bdrms), bathrooms (Fbath, Hbath), lot size (Lotsize), and sale price (Sale_price). Of particular interest, the sale price varies from a minimum of $2,000 to a maximum of $6,800,000, with a median value of $103,750, indicating a wide range of property values in the dataset.
summary(property_sales_2018)
## PropType Taxkey Address CondoProject
## Length:886 Min. :1.002e+07 Length:886 Length:886
## Class :character 1st Qu.:1.040e+09 Class :character Class :character
## Mode :character Median :1.480e+09 Mode :character Mode :character
## Mean :1.394e+09
## 3rd Qu.:1.880e+09
## Max. :2.141e+09
## District Nbhd Style Extwall
## Min. :1.00 Min. : 40 Length:886 Length:886
## 1st Qu.:2.00 1st Qu.: 560 Class :character Class :character
## Median :5.00 Median :1040 Mode :character Mode :character
## Mean :5.13 Mean :1917
## 3rd Qu.:9.00 3rd Qu.:1440
## Max. :9.00 Max. :6970
## Stories Year_Built Nr_of_rms Fin_sqft Units
## Min. :0.000 Min. : 0 Min. :0.000 Min. : 0 Min. :0.0
## 1st Qu.:1.000 1st Qu.:1954 1st Qu.:0.000 1st Qu.: 1024 1st Qu.:1.0
## Median :1.000 Median :1960 Median :0.000 Median : 1200 Median :1.0
## Mean :1.275 Mean :1932 Mean :0.693 Mean : 2446 Mean :1.1
## 3rd Qu.:1.875 3rd Qu.:1971 3rd Qu.:0.000 3rd Qu.: 1630 3rd Qu.:1.0
## Max. :2.500 Max. :2018 Max. :7.000 Max. :145748 Max. :3.0
## Bdrms Fbath Hbath Lotsize
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. : 0
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 5120
## Median :3.000 Median :1.000 Median :0.0000 Median : 7200
## Mean :2.944 Mean :1.246 Mean :0.4718 Mean : 12952
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 9600
## Max. :6.000 Max. :4.000 Max. :2.0000 Max. :647650
## Sale_date Sale_price
## Length:886 Min. : 2000
## Class :character 1st Qu.: 65000
## Mode :character Median : 103750
## Mean : 169412
## 3rd Qu.: 140675
## Max. :6800000
We see that the majority of sales occurred at around 15k which can account for things like apartments and condos to around 200,000 with a few outliers going to the million mark, and based off of the summary statistics, a sale at 6 million, but we limited the view from 0 to 1 million to see the distribution clearly.
ggplot(property_sales_2018, aes(x = Sale_price)) +
geom_histogram(binwidth = 10000, fill = "green", color = "black", alpha = 0.7) +
scale_x_continuous(labels = scales::comma, limits = c(0, 1000000)) +
labs(title = "Distribution of Sale Prices in 2018",
x = "Sale Price",
y = "Count") +
theme_minimal()
## Warning: Removed 15 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
# Create a list of data frames
data_frames_list <- lapply(2018:2022, function(year) {
df_name <- paste0("property_sales_", year)
get(df_name)
})
# Function to standardize column types
standardize_types <- function(df) {
# Convert all columns to character
df[] <- lapply(df, as.character)
return(df)
}
# Combine the data frames into a single data frame
combined_df <- bind_rows(lapply(data_frames_list, standardize_types), .id = "Year")
# Calculate the average sale price for each year and convert to numeric
average_prices <- combined_df %>%
group_by(Year) %>%
summarise(avg_sale_price = mean(as.numeric(Sale_price), na.rm = TRUE))
combined_df <- merge(combined_df, average_prices, by = "Year")
# Create a line chart with customized x-axis labels
ggplot(average_prices, aes(x = Year, y = avg_sale_price, color = Year, group = 1)) +
geom_line() +
ggtitle("Average Sale Prices Over Years") +
theme_minimal() +
scale_x_discrete(labels = c("2018", "2019", "2020", "2021", "2022"))
Looking at the line plot we see that from 2018 to 2019 the average price
jumped from $169,411 to $258,378 dollars, a 52.61% increase, and from
2019 to 2020 ($207,131) we see a 19.9% decrease, then from 2020 to
2021($237,485) we have a 14.66% increase and then from 2021 to
2022($271,545) we have a 14.33% increase
Lets use a linear regression model to explore the trend to confirm if there is an upward trend
model <- lm(avg_sale_price ~ Year, data = combined_df)
# Display the summary of the regression model
summary(model)
##
## Call:
## lm(formula = avg_sale_price ~ Year, data = combined_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.061e-06 0.000e+00 0.000e+00 0.000e+00 2.201e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.694e+05 4.599e-09 3.684e+13 <2e-16 ***
## Year2 8.897e+04 4.917e-09 1.809e+13 <2e-16 ***
## Year3 3.772e+04 4.910e-09 7.682e+12 <2e-16 ***
## Year4 7.188e+04 4.902e-09 1.466e+13 <2e-16 ***
## Year5 1.021e+05 4.876e-09 2.095e+13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.369e-07 on 27056 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.702e+26 on 4 and 27056 DF, p-value: < 2.2e-16
# Visualize the regression line
ggplot(average_prices, aes(x = Year, y = avg_sale_price)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
ggtitle("Regression Line of Average Sale Prices Over Years") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
In this linear regression analysis, we fitted a model to explore the relationship between the year and average sale prices. The coefficients for ‘Year2’, ‘Year3’, ‘Year4’, and ‘Year5’ were found to be highly statistically significant (p-value < 0.001), indicating an upward trend in average sale prices over the years. The high R-squared values close to 1 suggest that the model effectively explains a substantial proportion of the variance in average sale prices based on the ‘Year’ variable. Overall, the results provide compelling evidence for a significant and positive trend in average sale prices from 2018 to 2022, implying a consistent increase in property values over this period. This confirms my hypothessi that there