#Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Install tidyverse only when it is missing. This check has to run BEFORE
# library(tidyverse): if the package were absent, library() would already have
# stopped the script and a later install guard would never be reached.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# tidyverse is a collection of R packages designed for data science, including:
# 
# ggplot2 for data visualization.
# dplyr for data manipulation.
# Load the dataset
# The CSV location can be overridden through the HOUSE_DATA_PATH environment
# variable so the script also runs on other machines; when the variable is
# unset, the original absolute path is used unchanged.
data_path <- Sys.getenv(
  "HOUSE_DATA_PATH",
  unset = "C:\\Users\\A Sanjay\\OneDrive\\Desktop\\STA 635 Project\\archive (6)\\data.csv"
)
# Fail early with a message that names the path that was tried.
if (!file.exists(data_path)) {
  stop("Dataset not found at: ", data_path, call. = FALSE)
}
data <- read_csv(data_path)
## Rows: 4600 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): street, city, statezip, country
## dbl  (13): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read_csv: Reads the CSV file containing the dataset into an R data frame.
# head: Displays the first few rows of the dataset to verify it was loaded correctly.

# Preview the first six rows to confirm the file was parsed as expected
print(head(data, n = 6L))
## # A tibble: 6 × 18
##   date                  price bedrooms bathrooms sqft_living sqft_lot floors
##   <dttm>                <dbl>    <dbl>     <dbl>       <dbl>    <dbl>  <dbl>
## 1 2014-05-02 00:00:00  313000        3      1.5         1340     7912    1.5
## 2 2014-05-02 00:00:00 2384000        5      2.5         3650     9050    2  
## 3 2014-05-02 00:00:00  342000        3      2           1930    11947    1  
## 4 2014-05-02 00:00:00  420000        3      2.25        2000     8030    1  
## 5 2014-05-02 00:00:00  550000        4      2.5         1940    10500    1  
## 6 2014-05-02 00:00:00  490000        2      1            880     6380    1  
## # ℹ 11 more variables: waterfront <dbl>, view <dbl>, condition <dbl>,
## #   sqft_above <dbl>, sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>,
## #   street <chr>, city <chr>, statezip <chr>, country <chr>
# Per-column summary (quartiles, mean, min, max for numeric columns;
# length/class/mode for character columns), stored and then printed
summary_stats <- data %>%
  summary()
print(summary_stats)
##       date                            price             bedrooms    
##  Min.   :2014-05-02 00:00:00.00   Min.   :       0   Min.   :0.000  
##  1st Qu.:2014-05-21 00:00:00.00   1st Qu.:  322875   1st Qu.:3.000  
##  Median :2014-06-09 00:00:00.00   Median :  460943   Median :3.000  
##  Mean   :2014-06-07 03:14:42.77   Mean   :  551963   Mean   :3.401  
##  3rd Qu.:2014-06-24 00:00:00.00   3rd Qu.:  654962   3rd Qu.:4.000  
##  Max.   :2014-07-10 00:00:00.00   Max.   :26590000   Max.   :9.000  
##    bathrooms      sqft_living       sqft_lot           floors     
##  Min.   :0.000   Min.   :  370   Min.   :    638   Min.   :1.000  
##  1st Qu.:1.750   1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000  
##  Median :2.250   Median : 1980   Median :   7683   Median :1.500  
##  Mean   :2.161   Mean   : 2139   Mean   :  14852   Mean   :1.512  
##  3rd Qu.:2.500   3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000  
##  Max.   :8.000   Max.   :13540   Max.   :1074218   Max.   :3.500  
##    waterfront            view          condition       sqft_above  
##  Min.   :0.000000   Min.   :0.0000   Min.   :1.000   Min.   : 370  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190  
##  Median :0.000000   Median :0.0000   Median :3.000   Median :1590  
##  Mean   :0.007174   Mean   :0.2407   Mean   :3.452   Mean   :1827  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300  
##  Max.   :1.000000   Max.   :4.0000   Max.   :5.000   Max.   :9410  
##  sqft_basement       yr_built     yr_renovated       street         
##  Min.   :   0.0   Min.   :1900   Min.   :   0.0   Length:4600       
##  1st Qu.:   0.0   1st Qu.:1951   1st Qu.:   0.0   Class :character  
##  Median :   0.0   Median :1976   Median :   0.0   Mode  :character  
##  Mean   : 312.1   Mean   :1971   Mean   : 808.6                     
##  3rd Qu.: 610.0   3rd Qu.:1997   3rd Qu.:1999.0                     
##  Max.   :4820.0   Max.   :2014   Max.   :2014.0                     
##      city             statezip           country         
##  Length:4600        Length:4600        Length:4600       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
# summary: Provides summary statistics (mean, median, min, max, quartiles) for each column in the dataset.


#Summary Statistics:
# Min, 1st Quartile, Median, Mean, 3rd Quartile, Max: These provide an overview of the data distribution for each feature (summary() does not report the standard deviation).
# For example:
# Price: Mean = $551,963, which indicates the average house price.
# Bedrooms: Mean = 3.4, indicating that most houses have around 3 to 4 bedrooms.
# Bathrooms: Mean = 2.16, with the middle half of houses having between 1.75 and 2.5 bathrooms.
# Sqft Living: Mean = 2139 sqft, indicating the average living area size.

# Count the NA entries in every column
missing_values <- data %>%
  is.na() %>%
  colSums()
print(missing_values)
##          date         price      bedrooms     bathrooms   sqft_living 
##             0             0             0             0             0 
##      sqft_lot        floors    waterfront          view     condition 
##             0             0             0             0             0 
##    sqft_above sqft_basement      yr_built  yr_renovated        street 
##             0             0             0             0             0 
##          city      statezip       country 
##             0             0             0
# colSums(is.na(data)): Checks for missing values by summing the NA values in each column.

# Missing Values:
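# No missing values: No column contains NA, so no NA handling is needed before proceeding with the analysis. Note that zeros may still act as placeholders (the summary shows a minimum price of 0 and yr_renovated values of 0), which is.na() does not detect.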

# Exploratory Data Analysis (EDA)
# Histograms for key features
# Only the plotted columns are kept, then reshaped to long format so that each
# feature gets its own facet. pivot_longer() replaces the superseded gather().
data %>%
  select(price, bedrooms, bathrooms, sqft_living, floors, yr_built) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~variable, scales = "free_x") +
  theme_minimal()

# pivot_longer: Converts wide-format data to long-format, which is useful for plotting multiple histograms in a single graph.
# ggplot: Initializes the plot.
# aes(x = value): Maps the value to the x-axis.
# geom_histogram: Creates histograms.
# facet_wrap: Creates separate plots for each variable.
# theme_minimal: Applies a minimal theme to the plot.

# Histograms:
 
# Distributions:
# Price Distribution: Right-skewed, indicating a few very expensive properties.
# Bedrooms and Bathrooms: Most properties have 3-4 bedrooms and 2-3 bathrooms.
# Sqft Living: Peaks around 1500-2500 sqft, indicating the common range for living space.
# Floors and Year Built: Most properties have 1-2 floors and were built between 1950-1980.

# Scatter plots of price against living area, bedroom count and bathroom count
ggplot(data = data, mapping = aes(x = sqft_living, y = price)) +
  geom_point(colour = "blue") +
  labs(title = "Price vs. Sqft Living") +
  theme_minimal()

ggplot(data = data, mapping = aes(x = bedrooms, y = price)) +
  geom_point(colour = "blue") +
  labs(title = "Price vs. Bedrooms") +
  theme_minimal()

ggplot(data = data, mapping = aes(x = bathrooms, y = price)) +
  geom_point(colour = "blue") +
  labs(title = "Price vs. Bathrooms") +
  theme_minimal()

# Scatter Plots: Visualize the relationships between price and other features (sqft_living, bedrooms, bathrooms).
# geom_point: Adds points to the scatter plot.
# ggtitle: Adds a title to each plot.

# Scatter Plots:
# 
# Price vs. Sqft Living: Positive correlation; larger houses tend to have higher prices.
# Price vs. Bedrooms/Bathrooms: Some positive correlation, but less pronounced than sqft living.

# Correlation matrix
# Keep only the numeric columns and compute their pairwise correlations.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
correlation_matrix <- data %>%
  select(where(is.numeric)) %>%
  cor()

# Correlation Matrix:
 
# High Correlations:
# Sqft Living and Price: Strong positive correlation, indicating that as the living area increases, the price tends to increase.
# Sqft Above and Price: Also a strong positive correlation.
# Other Features: Waterfront, view, and condition also show some correlation with price.

# Plot the heatmap
# The matrix is turned into a long table with one row per (Var1, Var2) pair so
# that each pair can be drawn as one tile. pivot_longer() replaces the
# superseded gather(), and `limits` is spelled out in full instead of relying
# on partial argument matching of `limit`.
correlation_matrix %>%
  as.data.frame() %>%
  rownames_to_column(var = "Var1") %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "value") %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2))) +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
  coord_fixed()

# Correlation Matrix: Calculates and visualizes the correlation between numeric features.
# select(where(is.numeric)): Selects only numeric columns.
# cor: Computes the correlation matrix.
# pivot_longer: Reshapes the matrix into one row per pair of variables.
# geom_tile: Creates a heatmap.
# geom_text: Adds correlation values as text.
# scale_fill_gradient2: Sets the color gradient for the heatmap.

# Building the linear regression model
# sqft_basement is left out of the formula. With it included, lm() could not
# estimate its coefficient (reported as NA, "1 not defined because of
# singularities"), because it is an exact linear combination of predictors
# already in the model; the summary means are consistent with
# sqft_living = sqft_above + sqft_basement. lm() was already dropping the
# aliased term internally, so the remaining estimates and the fit statistics
# are unchanged; removing it makes the formula match what is actually fitted
# and avoids a rank-deficient fit being passed to predict().
# NOTE: the recorded output below was produced with sqft_basement still in the
# formula, which is why it shows the NA row.
model <- lm(
  price ~ bedrooms + bathrooms + sqft_living + floors + waterfront + view +
    condition + sqft_above + yr_built + yr_renovated,
  data = data
)
summary(model)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + floors + 
##     waterfront + view + condition + sqft_above + sqft_basement + 
##     yr_built + yr_renovated, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2105681  -128343   -16711    89700 26335249 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.609e+06  6.859e+05   6.720 2.04e-11 ***
## bedrooms      -5.618e+04  1.049e+04  -5.356 8.92e-08 ***
## bathrooms      5.919e+04  1.701e+04   3.479 0.000508 ***
## sqft_living    2.292e+02  2.169e+01  10.565  < 2e-16 ***
## floors         4.598e+04  1.863e+04   2.468 0.013606 *  
## waterfront     3.605e+05  9.386e+04   3.841 0.000124 ***
## view           4.480e+04  1.097e+04   4.082 4.54e-05 ***
## condition      3.125e+04  1.306e+04   2.393 0.016741 *  
## sqft_above     2.208e+01  2.149e+01   1.027 0.304248    
## sqft_basement         NA         NA      NA       NA    
## yr_built      -2.395e+03  3.419e+02  -7.006 2.82e-12 ***
## yr_renovated   6.772e+00  8.643e+00   0.784 0.433354    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 499800 on 4589 degrees of freedom
## Multiple R-squared:  0.216,  Adjusted R-squared:  0.2143 
## F-statistic: 126.4 on 10 and 4589 DF,  p-value: < 2.2e-16
# lm: Fits a linear regression model.
# price ~ ...: Specifies the response variable (price) and the predictor variables (bedrooms, bathrooms, etc.).
# summary: Provides detailed information about the fitted model, including coefficients, R-squared value, and p-values.

# Linear Regression Model:
# 
# Coefficients: Provide the estimated change in price for a one-unit change in each feature, holding all other features constant.
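# Positive Coefficients:
# Sqft Living: Each additional square foot is associated with a price increase of about $229.
# Bathrooms: Each additional bathroom is associated with a price increase of about $59,000.
# Negative Coefficients:
# Bedrooms: Holding living area and the other features constant, each additional bedroom is associated with a price decrease of about $56,000.
# Year Built: Each later construction year is associated with a price decrease of about $2,400, i.e. newer houses are estimated to be cheaper than comparable older ones.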
# Intercept: The baseline price when all features are zero.

# Model evaluation
# In-sample predictions: the fitted model is applied to the same rows it was
# trained on
predictions <- predict(model, newdata = data)

# Model Evaluation:
# 
# R-squared:
# Value: Measures the proportion of variance in the dependent variable (price) explained by the independent variables (features).
# Example: The R-squared of about 0.216 obtained for this model means that roughly 21.6% of the variability in house prices is explained by the model.

# R-squared is read from the model summary; MSE is the mean of the squared
# differences between predicted and observed prices
r_squared <- summary(model)[["r.squared"]]
mse <- mean((predictions - data[["price"]])^2)

# Mean Squared Error (MSE):
# Value: The average of the squares of the errors (difference between actual and predicted prices).
# Example: The MSE of about 2.49e11 obtained for this model corresponds to a root mean squared error of roughly $499,000, which is the typical size of a prediction error in dollars.

# Report both metrics; the label and the value are joined with a single space
print(paste(c("R-squared: ", r_squared), collapse = " "))
## [1] "R-squared:  0.215998743639482"
print(paste(c("Mean Squared Error: ", mse), collapse = " "))
## [1] "Mean Squared Error:  249187320761.844"
# predict: Uses the fitted model to predict price for the dataset.
# R-squared: Measures the proportion of variance in the dependent variable that is predictable from the independent variables.
# Mean Squared Error (MSE): Measures the average squared difference between observed and predicted values.

# Summary of Findings
# Most Influential Features:
 
# Sqft Living: Strongest predictor of house price.
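# Bedrooms and Bathrooms: Both are statistically significant; an extra bathroom raises the estimated price, while an extra bedroom lowers it once living area is held constant.
# Waterfront and View: Premium features that significantly affect price.
# Year Built: The coefficient is negative, so newer homes are estimated to be cheaper than comparable older homes.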
# Model Performance:
 
# R-squared: About 0.216, which indicates a weak fit: the model explains only about 22% of the variability in house prices, leaving most of the variation unexplained.
# MSE: Provides a measure of the prediction error.
# Practical Implications:
 
# For Home Buyers: Understanding which features most influence price can guide purchasing decisions.
# For Home Sellers: Highlighting features like living space, waterfront, and view can help justify higher prices.
# For Real Estate Agents: Insights can aid in pricing strategies and advising clients on property improvements.

# Summary of Packages and Functions Used
# tidyverse: A collection of packages for data science tasks.
 
# read_csv: Reads CSV files.
# ggplot2: Creates graphics and visualizations.
# dplyr: Provides functions for data manipulation (e.g., select_if, summarize).
# tidyr: Helps in tidying data (e.g., gather).
# lm: Fits linear models.
 
# predict: Makes predictions based on the fitted model.
 
# summary: Provides summary statistics for data and models.
# 
# cor: Computes correlation matrices.
 
# write_csv: Writes data to CSV files.
 
# Interpretation of the Results
# Summary Statistics and Missing Values: Confirm that the data is complete and provide an overview of the key features.
 
# EDA: Visualize data distributions and relationships between variables to identify patterns and insights.
 
# Correlation Matrix: Understand the relationships between numeric variables and identify potential multicollinearity.
 
# Linear Regression Model: Quantify the effect of each predictor on the response variable (price).
 
# Model Evaluation: Assess model performance using R-squared and MSE.
 
# Predictions: Save the predicted values for further analysis or reporting.

# Write the predicted prices to predictions.csv in the working directory as a
# single column named "predictions"
write_csv(data.frame(predictions = predictions), "predictions.csv")