adult_dataa <- read.csv("C:/Users/RAKESH REDDY/OneDrive/Desktop/adult_income_data.csv")
head(adult_dataa)
## age workclass fnlwgt education edunum maritalstatus
## 1 25 Private 226802 11th 7 Never-married
## 2 38 Private 89814 HS-grad 9 Married-civ-spouse
## 3 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse
## 4 44 Private 160323 Some-college 10 Married-civ-spouse
## 5 18 ? 103497 Some-college 10 Never-married
## 6 34 Private 198693 10th 6 Never-married
## occupation relationship race sex capitalgain capitalloss
## 1 Machine-op-inspct Own-child Black Male 0 0
## 2 Farming-fishing Husband White Male 0 0
## 3 Protective-serv Husband White Male 0 0
## 4 Machine-op-inspct Husband Black Male 7688 0
## 5 ? Own-child White Female 0 0
## 6 Other-service Not-in-family White Male 0 0
## hoursperweek nativecountry income
## 1 40 United-States <=50K.
## 2 50 United-States <=50K.
## 3 40 United-States >50K.
## 4 40 United-States >50K.
## 5 30 United-States <=50K.
## 6 30 United-States <=50K.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(pwrss)
## Warning: package 'pwrss' was built under R version 4.3.2
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(pwr)
library(glm2)
library(ggplot2)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(xts)
## Warning: package 'xts' was built under R version 4.3.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.2
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##Creating a time-based column
As I don’t have a time based column in my data, I am creating a new column as Date_of_joining and assigning the random dates within in a specified range using the following code.
# Define the start and end dates
start_date <- as.Date("1978-01-01")
end_date <- as.Date("2023-10-31") # Adjust the end date as needed
# Generate random dates within the specified range
random_dates <- sample(seq(start_date, end_date, by="days"), nrow(adult_dataa), replace=FALSE)
# Assign the random dates to the Date_of_joining column
adult_dataa$Date_of_joining <- random_dates
Here I am choosing hours per week column as response variable to analyse over the time
response_variable <- adult_dataa$hoursperweek
Here, a tsibble object (ts_data) is created using the as_tsibble function. It converts the data frame adult_dataa into a tsibble, which is a tidy time series data structure.
ts_data <- as_tsibble(adult_dataa, key = NULL, index = Date_of_joining) |>
fill_gaps()
This code creates a time series plot using ggplot2. It visualizes the “Response_Variable” over time, with the x-axis representing the “Date_of_joining.” This plot provides a visual representation of how the response variable changes over the entire time period.
# an "xts" object separate from the original
adult_xts <- xts(x = ts_data$hoursperweek,
order.by = ts_data$Date_of_joining)
adult_xts <- setNames(adult_xts, "hoursperweek")
adult_xts %>%
rollapply(width = 20, \(x) mean(x, na.rm = TRUE), fill = FALSE) %>%
ggplot(mapping = aes(x = Index, y = hoursperweek)) +
geom_line() +
geom_smooth(method = 'lm', color = 'red', se = FALSE) +
labs(title = "Hours Per week over time trends") +
theme_hc()
## `geom_smooth()` using formula = 'y ~ x'
ts_data |>
filter_index("1991" ~ "1992") |>
drop_na() |>
ggplot(mapping = aes(x = Date_of_joining , y = hoursperweek)) +
geom_point(size=1, shape='O') +
geom_smooth(span=0.2, color = 'orange', se=FALSE) +
labs(title = "Hours per week Trends between 1991 and 1992") +
theme_hc()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ts_data |>
filter_index("2020-01" ~ "2021-06") |>
ggplot(mapping = aes(x = Date_of_joining, y = hoursperweek)) +
geom_line() +
geom_smooth(method = 'lm', color = 'orange', se=FALSE) +
labs(title = "Hours per week trends") +
theme_hc()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
This code fits a linear regression model to the entire time series data, attempting to capture any linear trends over time.
linear_model <- lm(hoursperweek ~ Date_of_joining, data = as.data.frame(ts_data))
summary(linear_model)
##
## Call:
## lm(formula = hoursperweek ~ Date_of_joining, data = as.data.frame(ts_data))
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.448 -0.500 -0.381 4.592 58.719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.054e+01 2.489e-01 162.908 <2e-16 ***
## Date_of_joining -1.332e-05 2.025e-05 -0.658 0.511
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.48 on 16279 degrees of freedom
## (459 observations deleted due to missingness)
## Multiple R-squared: 2.658e-05, Adjusted R-squared: -3.485e-05
## F-statistic: 0.4327 on 1 and 16279 DF, p-value: 0.5107
The very small coefficient for Date_of_joining and the high p-value suggest that the variable Date_of_joining does not have a significant linear relationship with hoursperweek. The R-squared values are close to zero, indicating that the linear regression model does not explain much of the variability in hoursperweek. ***This maybe be because we randomly assigned dates to our data
# Subsetting data for multiple trends (if needed)
# Example: Subsetting data for the first 4 years
subset_data <- filter(ts_data, Date_of_joining >= as.Date("1976-01-01") & Date_of_joining <= as.Date("1981-01-01"))
subset_linear_model <- lm(hoursperweek ~ Date_of_joining, data = as.data.frame(subset_data))
summary(subset_linear_model)
##
## Call:
## lm(formula = hoursperweek ~ Date_of_joining, data = as.data.frame(subset_data))
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.449 -2.453 -0.446 4.550 58.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.051e+01 4.313e+00 9.393 <2e-16 ***
## Date_of_joining -1.839e-05 1.237e-03 -0.015 0.988
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.73 on 1053 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 2.099e-07, Adjusted R-squared: -0.0009495
## F-statistic: 0.000221 on 1 and 1053 DF, p-value: 0.9881
The negative coefficient for Date_of_joining suggests a small decrease in hoursperweek as Date_of_joining increases, but this relationship is not statistically significant. The p-value for Date_of_joining is 0.169, which is greater than the typical significance level of 0.05. Therefore, we fail to reject the null hypothesis that the coefficient is zero. ***This maybe be because we randomly assigned dates to our data
ts_data |>
index_by(year = floor_date(Date_of_joining, 'halfyear')) |>
summarise(avg_hours = mean(hoursperweek, na.rm = TRUE)) |>
ggplot(mapping = aes(x = year, y = avg_hours )) +
geom_line() +
geom_smooth(span = 0.3, color = 'orange', se=FALSE, ) +
labs(title = "Average hours worked Over Time",
subtitle = "(by half year)") +
scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
theme_hc()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
The average number of hours worked per week has been decreasing over
time.
# Plot ACF and PACF to illustrate seasonality
ts_data <- na.omit(ts_data)
your_acf <- acf(ts_data$hoursperweek, lag.max = 30)
your_pacf <- pacf(ts_data$hoursperweek, lag.max = 30)
# Plot ACF
autoplot(your_acf) +
labs(title = "Autocorrelation Function (ACF)",
x = "Lag",
y = "ACF")
# Plot PACF
autoplot(your_pacf) +
labs(title = "Partial Autocorrelation Function (PACF)",
x = "Lag",
y = "PACF")
The graph shows a decreasing trend over time, indicating that the average number of hours worked per week has been decreasing over time. The PACF plot for the hoursperweek variable shows a significant correlation at lag 12. This suggests that the time series exhibits a seasonal pattern of 12 time periods, which is likely to correspond to monthly seasonality.In addition to the significant correlation at lag 12, there are also smaller but significant correlations at lags 6 and 24. This suggests that the time series may also exhibit a semi-seasonal pattern of 6 time periods and a bi-annual pattern of 24 time periods.The lack of significant correlations at lags greater than 24 suggests that there are no long-term trends or cycles in the data.