Loading the data

adult_dataa <- read.csv("C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.csv")

head(adult_dataa)
##   age  workclass fnlwgt     education edunum       maritalstatus
## 1  25    Private 226802          11th      7       Never-married
## 2  38    Private  89814       HS-grad      9  Married-civ-spouse
## 3  28  Local-gov 336951    Assoc-acdm     12  Married-civ-spouse
## 4  44    Private 160323  Some-college     10  Married-civ-spouse
## 5  18          ? 103497  Some-college     10       Never-married
## 6  34    Private 198693          10th      6       Never-married
##           occupation   relationship   race     sex capitalgain capitalloss
## 1  Machine-op-inspct      Own-child  Black    Male           0           0
## 2    Farming-fishing        Husband  White    Male           0           0
## 3    Protective-serv        Husband  White    Male           0           0
## 4  Machine-op-inspct        Husband  Black    Male        7688           0
## 5                  ?      Own-child  White  Female           0           0
## 6      Other-service  Not-in-family  White    Male           0           0
##   hoursperweek  nativecountry  income
## 1           40  United-States  <=50K.
## 2           50  United-States  <=50K.
## 3           40  United-States   >50K.
## 4           40  United-States   >50K.
## 5           30  United-States  <=50K.
## 6           30  United-States  <=50K.

Loading the required libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(pwrss)
## Warning: package 'pwrss' was built under R version 4.3.2
## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test
library(pwr)
library(glm2)
library(ggplot2)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(xts)
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
## 
## Attaching package: 'zoo'
## 
## The following object is masked from 'package:tsibble':
## 
##     index
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

##Creating a time-based column

As I don’t have a time based column in my data, I am creating a new column as Date_of_joining and assigning the random dates within in a specified range using the following code.

# Define the start and end dates
start_date <- as.Date("1978-01-01")
end_date <- as.Date("2023-10-31")  # Adjust the end date as needed

# Generate random dates within the specified range
random_dates <- sample(seq(start_date, end_date, by="days"), nrow(adult_dataa), replace=FALSE)

# Assign the random dates to the Date_of_joining column
adult_dataa$Date_of_joining <- random_dates

Response Variable

Here I am choosing hours per week column as response variable to analyse over the time

response_variable <- adult_dataa$hoursperweek

Create a tsibble object

Here, a tsibble object (ts_data) is created using the as_tsibble function. It converts the data frame adult_dataa into a tsibble, which is a tidy time series data structure.

ts_data <- as_tsibble(adult_dataa, key = NULL, index = Date_of_joining) |>
  fill_gaps() 

Plot the data over time

This code creates a time series plot using ggplot2. It visualizes the “Response_Variable” over time, with the x-axis representing the “Date_of_joining.” This plot provides a visual representation of how the response variable changes over the entire time period.

# an "xts" object separate from the original
adult_xts <- xts(x = ts_data$hoursperweek, 
                  order.by = ts_data$Date_of_joining)

adult_xts <- setNames(adult_xts, "hoursperweek")
adult_xts %>%
  rollapply(width = 20, \(x) mean(x, na.rm = TRUE), fill = FALSE) %>%
  ggplot(mapping = aes(x = Index, y = hoursperweek)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'red', se = FALSE) +
  labs(title = "Hours Per week over time trends") +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'

ts_data |>
  filter_index("1991" ~ "1992") |>
  drop_na() |>
  ggplot(mapping = aes(x = Date_of_joining , y = hoursperweek)) +
  geom_point(size=1, shape='O') +
  geom_smooth(span=0.2, color = 'orange', se=FALSE) +
  labs(title = "Hours per week Trends between 1991 and 1992") +
  theme_hc()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ts_data |>
  filter_index("2020-01" ~ "2021-06") |>
  ggplot(mapping = aes(x = Date_of_joining, y = hoursperweek)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'orange', se=FALSE) +
  labs(title = "Hours per week trends") +
  theme_hc()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).