library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(readr)
library(stats)
library(dplyr)
library(xts)
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
# Load the 2018 Telangana weather records from a comma-delimited file.
my_data <- read_delim(
  file = "C:/Users/user/Documents/Statistics/Telangana_2018_complete_weather_data.csv",
  delim = ","
)
## Rows: 230384 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): District, Mandal, Location, Date
## dbl (6): row_id, temp_min, temp_max, humidity_min, humidity_max, wind_speed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Choose a column of data to analyze over time. This should be a “response-like” variable that is of particular interest.

Response variable - temp_max

I will take “temp_max” column from my dataset to analyze the maximum temperature values recorded over the specified time period.

Create a tsibble object of just the date and response variable. Then, plot your data over time.

# Count missing entries in the two columns the time series is built from.
missing_date_count <- my_data$Date %>% is.na() %>% sum()
missing_temp_max_count <- my_data$temp_max %>% is.na() %>% sum()

cat("Missing values in 'Date' column:", missing_date_count, "\n")
## Missing values in 'Date' column: 0
cat("Missing values in 'temp_max' column:", missing_temp_max_count, "\n")
## Missing values in 'temp_max' column: 0

# Keep only the first row encountered for each Date so that Date can serve
# as a unique tsibble index.
# NOTE(review): the file was read with 230384 rows plus District, Mandal and
# Location columns, so this keeps a single row per date and drops every other
# reading for that date. If a series across all locations is intended,
# aggregating temp_max per Date may be more appropriate -- confirm.
my_data <- distinct(my_data, Date, .keep_all = TRUE)

# Sanity check: after distinct() on Date this count is necessarily zero.
num_duplicates <- sum(duplicated(my_data[["Date"]]))

cat("Number of duplicates in the 'Date' column:", num_duplicates, "\n")
## Number of duplicates in the 'Date' column: 0

# Convert the day-month-year strings into Date objects.
my_data[["Date"]] <- as.Date(my_data[["Date"]], format = "%d-%m-%Y")

# Build a tsibble indexed by Date holding only the date and the response.
ts_data <- my_data %>%
  as_tsibble(index = Date) %>%
  select(Date, temp_max)


# Line plot of maximum temperature against date, with one x-axis tick per
# month and tick labels rotated so they do not overlap.
ggplot(data = ts_data, mapping = aes(x = Date, y = temp_max)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  labs(
    title = "Maximum Temperature Over Time",
    x = "Date",
    y = "Maximum Temperature (°C)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Use linear regression to detect any upwards or downwards trends.

# Regress maximum temperature on date; the sign and significance of the Date
# coefficient indicate whether there is an upward or downward linear trend.
lm_model <- lm(formula = temp_max ~ Date, data = my_data)

# Print the coefficient table, residual summary and fit statistics.
summary(lm_model)
## 
## Call:
## lm(formula = temp_max ~ Date, data = my_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9509 -2.6138 -0.2543  2.4645  7.9068 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 362.435348  46.309310   7.826 1.13e-13 ***
## Date         -0.018582   0.002621  -7.089 1.17e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.413 on 271 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1533 
## F-statistic: 50.26 on 1 and 271 DF,  p-value: 1.17e-11

Based on the output of the linear regression above, the coefficient of the Date variable is negative, with an estimated value of approximately -0.018582 and a p-value of 1.17e-11. This indicates a statistically significant downward trend in the maximum temperature data over time.

# Same line plot as before, overlaid with a straight-line (lm) fit drawn in
# blue without a confidence band.
ggplot(data = ts_data, mapping = aes(x = Date, y = temp_max)) +
  geom_line() +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  labs(
    title = "Maximum Temperature Over Time",
    x = "Date",
    y = "Maximum Temperature (°C)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

The blue line in this plot represents a linear fit to the time series data. It indicates a downward trend: the maximum temperature decreases as we go from January to October 2018.

Subset of the data

# Keep only the observations dated in March, April or May.
filtered_data <- filter(my_data, month(Date) %in% 3:5)

# Rebuild the tsibble from the subset; this replaces the full-period ts_data.
ts_data <- filtered_data %>%
  as_tsibble(index = Date) %>%
  select(Date, temp_max)

# Line plot of the subset with a straight-line (lm) fit overlaid in blue.
ggplot(data = ts_data, mapping = aes(x = Date, y = temp_max)) +
  geom_line() +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(
    title = "Maximum Temperature in March, April, and May",
    x = "Date",
    y = "Maximum Temperature (°C)"
  )
## `geom_smooth()` using formula = 'y ~ x'

The blue line in this plot represents a linear fit to the time series data. It indicates an upward trend: the maximum temperature increases as we go from March to May 2018.

Use smoothing to detect at least one season in your data, and interpret your results

# Date was already converted earlier in the script; applying as.Date() to a
# column that is already of class Date leaves it unchanged. The call is kept
# so this section does not silently depend on that earlier step.
my_data$Date <- as.Date(my_data$Date, format = "%d-%m-%Y")

# Rebuild the full-period tsibble, because ts_data currently holds only the
# March-May subset created in the previous section.
ts_data <- as_tsibble(my_data, index = Date) %>%
  select(Date, temp_max)

# Line plot with a LOESS smoother (span = 0.3) to look for seasonal structure.
ggplot(ts_data, aes(x = Date, y = temp_max)) +
  geom_line() +
  geom_smooth(span = 0.3, color = "blue", se = FALSE) +
  labs(
    title = "Maximum Temperature Over Time",
    x = "Date",
    y = "Maximum Temperature (°C)"
  ) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  # theme_hc() is a complete theme: it must be added BEFORE the theme() tweak,
  # otherwise it replaces the axis.text.x rotation and the monthly labels are
  # drawn unrotated.
  theme_hc() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

From the above plot, we can conclude that there is no repeating pattern or cycle (seasonality) in the time series data that occurs at regular intervals.

# Compute the autocorrelation function of the temp_max values in ts_data,
# draw the ACF plot, and keep the returned object in acf_result.
# NOTE(review): the lags are counted in rows, so reading them as days assumes
# the dates in ts_data are consecutive with no gaps -- TODO confirm.
acf_result <- acf(ts_data$temp_max)

From the above output, the peaks in the ACF plot do not occur at regular intervals. Hence, there appears to be no seasonality in the data.