### Loading the California Housing Dataset

library(readr)
# Load the California housing CSV into a data.frame with base read.csv().
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script runs elsewhere.
housing <- read.csv(file = "/Users/sharmistaroy/Downloads/housing.csv")

# Show the first six rows as a quick check of the parsed columns.
head(housing, n = 6L)
##   longitude latitude housing_median_age total_rooms total_bedrooms population
## 1   -122.23    37.88                 41         880            129        322
## 2   -122.22    37.86                 21        7099           1106       2401
## 3   -122.24    37.85                 52        1467            190        496
## 4   -122.25    37.85                 52        1274            235        558
## 5   -122.25    37.85                 52        1627            280        565
## 6   -122.25    37.85                 52         919            213        413
##   households median_income median_house_value ocean_proximity
## 1        126        8.3252             452600        NEAR BAY
## 2       1138        8.3014             358500        NEAR BAY
## 3        177        7.2574             352100        NEAR BAY
## 4        219        5.6431             341300        NEAR BAY
## 5        259        3.8462             342200        NEAR BAY
## 6        193        4.0368             269700        NEAR BAY

### Creating a Time Column and Converting It to Date Format

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(fable)
## Loading required package: fabletools
# Seed the RNG, as the original workflow does.
set.seed(123)

# Give every row an artificial monthly date, starting at 2000-01-01 and
# advancing one month per row, stored as a Date column.
housing[["timestamp"]] <- as.Date(
  seq(
    from = as.Date("2000-01-01"),
    by = "months",
    length.out = nrow(housing)
  )
)

We now have a synthetic time column named `timestamp`: one monthly date per row, starting at 2000-01-01.

### Time Series Analysis: Choose a Response-Like Variable, Create a tsibble Object, and Plot the Data

# Build a two-column tsibble from the housing data. No `index` is given, so
# tsibble selects `timestamp` as the index variable on its own.
time_series_data <- tsibble(
  timestamp = housing[["timestamp"]],
  median_house_value = housing[["median_house_value"]]
)
## Using `timestamp` as index variable.
library(ggplot2)
# Plot the series over time. The measured column is passed explicitly as the
# second argument: autoplot() for a tsibble has no `series` parameter (it was
# forwarded to geom_line() and ignored with a warning), and naming the column
# also avoids the "Plot variable not specified" auto-selection message.
autoplot(time_series_data, median_house_value) +
  labs(
    title = "Time Series Plot of 'median_house_value'",
    x = "Time",
    y = "Median House Value"
  )

A tsibble object is created from the ‘timestamp’ and ‘median_house_value’ columns. The data is then plotted over time with `autoplot()`, which builds on ggplot2. This visualization helps in understanding the overall trend and patterns in the median house values.

# Keep a second handle on the series for the ACF/PACF calls further down.
# NOTE(review): time_series_data was already built with tsibble(), so this
# conversion looks redundant — confirm before removing.
response_tsibble <- as_tsibble(time_series_data, index = "timestamp")

### Linear Regression to Detect Trends

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Fit a straight-line trend: median_house_value regressed on timestamp.
# NOTE(review): timestamp is a Date, so the slope is presumably expressed
# per day — confirm the unit before interpreting the coefficient.
linear_model <- lm(median_house_value ~ timestamp, data = time_series_data)

# Print the coefficient table, residual quantiles, and fit statistics.
summary(linear_model)
## 
## Call:
## lm(formula = median_house_value ~ timestamp, data = time_series_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -205095  -86882  -26683   56388  307428 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.919e+05  1.644e+03  116.73   <2e-16 ***
## timestamp   4.587e-02  4.418e-03   10.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115100 on 20638 degrees of freedom
## Multiple R-squared:  0.005196,   Adjusted R-squared:  0.005148 
## F-statistic: 107.8 on 1 and 20638 DF,  p-value: < 2.2e-16

### Smoothing to Detect Seasonality

library(forecast)


# Let auto.arima() choose an ARIMA order for the house-value column.
# NOTE(review): the column is passed as a bare vector rather than a ts with
# a frequency, so seasonal terms presumably cannot be selected — confirm
# whether ts(..., frequency = 12) was intended given the section title.
smooth_model <- auto.arima(time_series_data$median_house_value)

# Print the selected order, coefficients, and training-set error measures.
summary(smooth_model)
## Series: time_series_data$median_house_value 
## ARIMA(3,1,2) 
## 
## Coefficients:
##          ar1      ar2      ar3      ma1     ma2
##       1.1069  -0.1623  -0.0078  -1.6238  0.6314
## s.e.  0.0282   0.0153   0.0110   0.0273  0.0261
## 
## sigma^2 = 3.206e+09:  log likelihood = -255160.2
## AIC=510332.3   AICc=510332.3   BIC=510379.9
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -90.73546 56614.81 36586.89 -7.28316 20.27985 0.9533785
##                      ACF1
## Training set 3.895652e-05
# Autocorrelation of the ARIMA residuals, computed without drawing so the
# result can be handed to autoplot() below.
acf_result <- acf(residuals(smooth_model), plot = FALSE)

# Draw the residual ACF from the stored result.
autoplot(acf_result)

# Autocorrelation of the raw house-value series.
acf(response_tsibble$median_house_value)

# Partial autocorrelation of the same series.
pacf(response_tsibble$median_house_value)

In this analysis, we explored the time aspect of the California housing dataset. A synthetic “timestamp” column was created as a Date object, and a tsibble object was built from it for time series analysis. A time plot, a linear regression, an ARIMA fit, and ACF/PACF plots were applied to understand the dynamics of the “median_house_value” variable over time. The results provide insights into any trends and seasonality present in the dataset; because the timestamps are synthetic, these patterns reflect the row order of the data rather than real calendar time.