# Install the tidyverse if it is not available yet. This guard has to run
# before any library() call: library() stops with an error on a missing
# package, so a guard placed after library(tidyverse) can never be reached on
# a machine that actually needs it. Installing tidyverse also installs ggplot2
# and dplyr, which are among its dependencies.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# Load necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# tidyverse is a collection of R packages designed for data science, including:
#
# ggplot2 for data visualization.
# dplyr for data manipulation.
# Load the dataset into `data` with readr's read_csv().
# NOTE(review): the path is absolute and machine-specific; it has to be
# adjusted before the script can run on another computer.
data_path <- "C:\\Users\\A Sanjay\\OneDrive\\Desktop\\STA 635 Project\\archive (6)\\data.csv"
data <- read_csv(file = data_path)
## Rows: 4600 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): street, city, statezip, country
## dbl (13): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read_csv: Reads the CSV file containing the dataset into an R data frame.
# head: Displays the first few rows of the dataset to verify it was loaded correctly.
# Show the first six rows as a quick check that the file loaded as expected
data %>%
  head() %>%
  print()
## # A tibble: 6 × 18
## date price bedrooms bathrooms sqft_living sqft_lot floors
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014-05-02 00:00:00 313000 3 1.5 1340 7912 1.5
## 2 2014-05-02 00:00:00 2384000 5 2.5 3650 9050 2
## 3 2014-05-02 00:00:00 342000 3 2 1930 11947 1
## 4 2014-05-02 00:00:00 420000 3 2.25 2000 8030 1
## 5 2014-05-02 00:00:00 550000 4 2.5 1940 10500 1
## 6 2014-05-02 00:00:00 490000 2 1 880 6380 1
## # ℹ 11 more variables: waterfront <dbl>, view <dbl>, condition <dbl>,
## # sqft_above <dbl>, sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>,
## # street <chr>, city <chr>, statezip <chr>, country <chr>
# Summary statistics: min, quartiles, median, mean and max for the numeric and
# date columns; length, class and mode for the character columns
summary_stats <- data %>% summary()
print(summary_stats)
## date price bedrooms
## Min. :2014-05-02 00:00:00.00 Min. : 0 Min. :0.000
## 1st Qu.:2014-05-21 00:00:00.00 1st Qu.: 322875 1st Qu.:3.000
## Median :2014-06-09 00:00:00.00 Median : 460943 Median :3.000
## Mean :2014-06-07 03:14:42.77 Mean : 551963 Mean :3.401
## 3rd Qu.:2014-06-24 00:00:00.00 3rd Qu.: 654962 3rd Qu.:4.000
## Max. :2014-07-10 00:00:00.00 Max. :26590000 Max. :9.000
## bathrooms sqft_living sqft_lot floors
## Min. :0.000 Min. : 370 Min. : 638 Min. :1.000
## 1st Qu.:1.750 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000
## Median :2.250 Median : 1980 Median : 7683 Median :1.500
## Mean :2.161 Mean : 2139 Mean : 14852 Mean :1.512
## 3rd Qu.:2.500 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000
## Max. :8.000 Max. :13540 Max. :1074218 Max. :3.500
## waterfront view condition sqft_above
## Min. :0.000000 Min. :0.0000 Min. :1.000 Min. : 370
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190
## Median :0.000000 Median :0.0000 Median :3.000 Median :1590
## Mean :0.007174 Mean :0.2407 Mean :3.452 Mean :1827
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300
## Max. :1.000000 Max. :4.0000 Max. :5.000 Max. :9410
## sqft_basement yr_built yr_renovated street
## Min. : 0.0 Min. :1900 Min. : 0.0 Length:4600
## 1st Qu.: 0.0 1st Qu.:1951 1st Qu.: 0.0 Class :character
## Median : 0.0 Median :1976 Median : 0.0 Mode :character
## Mean : 312.1 Mean :1971 Mean : 808.6
## 3rd Qu.: 610.0 3rd Qu.:1997 3rd Qu.:1999.0
## Max. :4820.0 Max. :2014 Max. :2014.0
## city statezip country
## Length:4600 Length:4600 Length:4600
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# summary: Provides summary statistics (mean, median, min, max, quartiles) for each column in the dataset.
#Summary Statistics:
# Mean, Median, Min, Max, Quartiles: These provide an overview of the data distribution for each feature (summary() does not report the standard deviation).
# For example:
# Price: Mean = $551,963, which indicates the average house price.
# Bedrooms: Mean = 3.4, indicating that most houses have around 3 to 4 bedrooms.
# Bathrooms: Mean = 2.16, showing that most houses have between 2 and 3 bathrooms.
# Sqft Living: Mean = 2139 sqft, indicating the average living area size.
# Count the NA entries in every column (a zero means the column is complete)
missing_values <- data %>%
  is.na() %>%
  colSums()
print(missing_values)
## date price bedrooms bathrooms sqft_living
## 0 0 0 0 0
## sqft_lot floors waterfront view condition
## 0 0 0 0 0
## sqft_above sqft_basement yr_built yr_renovated street
## 0 0 0 0 0
## city statezip country
## 0 0 0
# colSums(is.na(data)): Checks for missing values by summing the NA values in each column.
# Missing Values:
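# No missing values: No column contains NA, so no NA handling is needed before proceeding with the analysis. Note, however, that the summary above shows minimum values of 0 for price, bedrooms and bathrooms; these may be placeholders for unknown values and are worth checking.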
# Exploratory Data Analysis (EDA)
# Histograms for key features.
# The selected numeric columns are reshaped to long format (one row per
# observation/variable pair) so that a single ggplot call can draw one facet
# per variable. pivot_longer() is used because gather() is superseded in tidyr.
data %>%
  pivot_longer(
    cols = c(price, bedrooms, bathrooms, sqft_living, floors, yr_built),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  # Each variable is on its own scale, so the x axis is free per facet
  facet_wrap(~variable, scales = "free_x") +
  theme_minimal()

# pivot_longer: Converts wide-format data to long-format, which is useful for plotting multiple histograms in a single graph.
# ggplot: Initializes the plot.
# aes(x = value): Maps the value to the x-axis.
# geom_histogram: Creates histograms.
# facet_wrap: Creates separate plots for each variable.
# theme_minimal: Applies a minimal theme to the plot.
# Histograms:
# Distributions:
# Price Distribution: Right-skewed, indicating a few very expensive properties.
# Bedrooms and Bathrooms: Most properties have 3-4 bedrooms and 2-3 bathrooms.
# Sqft Living: Peaks around 1500-2500 sqft, indicating the common range for living space.
# Floors and Year Built: Most properties have 1-2 floors and were built between 1950-1980.
# Scatter plots to examine relationships with price.
# The three plots differ only in the x variable and the title, so the shared
# point layer and theme are kept in one list that is added to each plot.
price_scatter_layers <- list(
  geom_point(color = "blue"),
  theme_minimal()
)

ggplot(data, aes(x = sqft_living, y = price)) +
  price_scatter_layers +
  ggtitle("Price vs. Sqft Living")

ggplot(data, aes(x = bedrooms, y = price)) +
  price_scatter_layers +
  ggtitle("Price vs. Bedrooms")

ggplot(data, aes(x = bathrooms, y = price)) +
  price_scatter_layers +
  ggtitle("Price vs. Bathrooms")

# Scatter Plots: Visualize the relationships between price and other features (sqft_living, bedrooms, bathrooms).
# geom_point: Adds points to the scatter plot.
# ggtitle: Adds a title to each plot.
# Scatter Plots:
#
# Price vs. Sqft Living: Positive correlation; larger houses tend to have higher prices.
# Price vs. Bedrooms/Bathrooms: Some positive correlation, but less pronounced than sqft living.
# Correlation matrix: pairwise correlations (cor() defaults) between all
# numeric columns. select(where(is.numeric)) is the current dplyr idiom for
# what select_if(is.numeric) did; select_if() is superseded.
correlation_matrix <- data %>%
  select(where(is.numeric)) %>%
  cor()
# Correlation Matrix:
# High Correlations:
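# Sqft Living and Price: Positive correlation, indicating that as the living area increases, the price tends to increase. It is moderate rather than strong: the full regression below reaches an R-squared of only 0.216, which caps any single included feature's correlation with price at about 0.46.
# Sqft Above and Price: Also a positive correlation, subject to the same cap.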
# Other Features: Waterfront, view, and condition also show some correlation with price.
# Plot the heatmap.
# The correlation matrix is turned into a long table with one row per
# (Var1, Var2) pair, which is the shape geom_tile() needs. pivot_longer()
# replaces the superseded gather().
correlation_matrix %>%
  as.data.frame() %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "value") %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2))) +
  # `limits` is spelled out in full; the previous `limit =` only worked through
  # partial argument matching.
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
  coord_fixed()

# Correlation Matrix: Calculates and visualizes the correlation between numeric features.
# select_if(is.numeric): Selects only numeric columns.
# cor: Computes the correlation matrix.
# geom_tile: Creates a heatmap.
# geom_text: Adds correlation values as text.
# scale_fill_gradient2: Sets the color gradient for the heatmap.
# Building the linear regression model.
# sqft_basement is left out of the formula. The summary output recorded below
# was produced with sqft_basement included and reports its coefficient as NA
# "because of singularities": it is an exact linear combination of the
# predictors listed before it (presumably sqft_living - sqft_above; TODO
# confirm). Dropping the aliased term leaves every other coefficient, the
# fitted values and R-squared unchanged, and gives a full-rank fit.
model <- lm(
  price ~ bedrooms + bathrooms + sqft_living + floors + waterfront + view +
    condition + sqft_above + yr_built + yr_renovated,
  data = data
)
summary(model)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + floors +
## waterfront + view + condition + sqft_above + sqft_basement +
## yr_built + yr_renovated, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2105681 -128343 -16711 89700 26335249
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.609e+06 6.859e+05 6.720 2.04e-11 ***
## bedrooms -5.618e+04 1.049e+04 -5.356 8.92e-08 ***
## bathrooms 5.919e+04 1.701e+04 3.479 0.000508 ***
## sqft_living 2.292e+02 2.169e+01 10.565 < 2e-16 ***
## floors 4.598e+04 1.863e+04 2.468 0.013606 *
## waterfront 3.605e+05 9.386e+04 3.841 0.000124 ***
## view 4.480e+04 1.097e+04 4.082 4.54e-05 ***
## condition 3.125e+04 1.306e+04 2.393 0.016741 *
## sqft_above 2.208e+01 2.149e+01 1.027 0.304248
## sqft_basement NA NA NA NA
## yr_built -2.395e+03 3.419e+02 -7.006 2.82e-12 ***
## yr_renovated 6.772e+00 8.643e+00 0.784 0.433354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 499800 on 4589 degrees of freedom
## Multiple R-squared: 0.216, Adjusted R-squared: 0.2143
## F-statistic: 126.4 on 10 and 4589 DF, p-value: < 2.2e-16
# lm: Fits a linear regression model.
# price ~ ...: Specifies the response variable (price) and the predictor variables (bedrooms, bathrooms, etc.).
# summary: Provides detailed information about the fitted model, including coefficients, R-squared value, and p-values.
# Linear Regression Model:
#
# Coefficients: Provide the estimated change in price for a one-unit change in each feature, holding all other features constant.
# Positive Coefficients:
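# Sqft Living: For each additional square foot, the predicted price increases by about $229.
# Bathrooms: Positive (about $59,000 per additional bathroom). Bedrooms is not positive: its coefficient is about -$56,000 per additional bedroom once living area and the other features are held constant.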
# Negative Coefficients:
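# Year Built: The coefficient is about -$2,400 per year, so newer houses are predicted to be cheaper (and older houses more expensive), all else equal.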
# Intercept: The baseline price when all features are zero.
# Model evaluation
# Predicting on the dataset: the model is applied to the same rows it was
# fitted on, so these are in-sample predictions
predictions <- model %>% predict(newdata = data)
# Model Evaluation:
#
# R-squared:
# Value: Measures the proportion of variance in the dependent variable (price) explained by the independent variables (features).
# Example: This model's R-squared of about 0.216 means that roughly 22% of the variability in house prices is explained by the model.
# Calculating R-squared and Mean Squared Error
model_summary <- summary(model)
r_squared <- model_summary[["r.squared"]]
# MSE is the mean of the squared gaps between observed and predicted prices
squared_errors <- (data$price - predictions)^2
mse <- mean(squared_errors)
# Mean Squared Error (MSE):
# Value: The average of the squares of the errors (difference between actual and predicted prices).
# Example: This model's MSE of about 2.49e11 corresponds to a root mean squared error of roughly $499,000, which is the typical size of the gap between observed and predicted house prices.
# Printing the evaluation metrics: each line is built first, then printed
r_squared_line <- paste("R-squared: ", r_squared)
print(r_squared_line)
## [1] "R-squared: 0.215998743639482"
mse_line <- paste("Mean Squared Error: ", mse)
print(mse_line)
## [1] "Mean Squared Error: 249187320761.844"
# predict: Uses the fitted model to predict price for the dataset.
# R-squared: Measures the proportion of variance in the dependent variable that is predictable from the independent variables.
# Mean Squared Error (MSE): Measures the average squared difference between observed and predicted values.
# Summary of Findings
# Most Influential Features:
# Sqft Living: Strongest predictor of house price.
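# Bedrooms and Bathrooms: Also significant but less than living space; bathrooms raise the predicted price, while additional bedrooms lower it once living area is held constant.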
# Waterfront and View: Premium features that significantly affect price.
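# Year Built: Holding the other features constant, newer homes are predicted to have lower prices (the yr_built coefficient is negative), i.e. older homes tend to be priced higher.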
# Model Performance:
# R-squared: About 0.216, which is a weak fit: the model explains only about 22% of the variability in house prices, so most of the variation is left unexplained.
# MSE: Provides a measure of the prediction error.
# Practical Implications:
# For Home Buyers: Understanding which features most influence price can guide purchasing decisions.
# For Home Sellers: Highlighting features like living space, waterfront, and view can help justify higher prices.
# For Real Estate Agents: Insights can aid in pricing strategies and advising clients on property improvements.
# Summary of Packages and Functions Used
# tidyverse: A collection of packages for data science tasks.
# read_csv: Reads CSV files.
# ggplot2: Creates graphics and visualizations.
# dplyr: Provides functions for data manipulation (e.g., select_if, summarize).
# tidyr: Helps in tidying data (e.g., gather).
# lm: Fits linear models.
# predict: Makes predictions based on the fitted model.
# summary: Provides summary statistics for data and models.
#
# cor: Computes correlation matrices.
# write_csv: Writes data to CSV files.
# Interpretation of the Results
# Summary Statistics and Missing Values: Confirm that the data is complete and provide an overview of the key features.
# EDA: Visualize data distributions and relationships between variables to identify patterns and insights.
# Correlation Matrix: Understand the relationships between numeric variables and identify potential multicollinearity.
# Linear Regression Model: Quantify the effect of each predictor on the response variable (price).
# Model Evaluation: Assess model performance using R-squared and MSE.
# Predictions: Save the predicted values for further analysis or reporting.
# Save the results: write the predicted prices as a one-column CSV file
# ("predictions.csv", relative to the working directory)
predictions_df <- data.frame(predictions = predictions)
write_csv(x = predictions_df, file = "predictions.csv")