Week 12 | Data Dive — Time-based Data

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

library(forecast)

## Warning: package 'forecast' was built under R version 4.3.2

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(TSA)

## Warning: package 'TSA' was built under R version 4.3.2

## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast

## 
## Attaching package: 'TSA'

## The following objects are masked from 'package:stats':
## 
##     acf, arima

## The following object is masked from 'package:utils':
## 
##     tar

library(tseries)

## Warning: package 'tseries' was built under R version 4.3.2

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(grid)

library(pageviews)

## Warning: package 'pageviews' was built under R version 4.3.2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ readr::spec()   masks TSA::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)

## 
## Attaching package: 'pwrss'
## 
## The following object is masked from 'package:stats':
## 
##     power.t.test

library(boot)
library(broom)
library(lindia)
library(dplyr)
library(ggplot2)
library(dplyr)
library(glmnet)  # For generalized linear models

## Warning: package 'glmnet' was built under R version 4.3.2

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8

library(caret)

## Warning: package 'caret' was built under R version 4.3.2

## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

mpg<- read_delim("C:/Users/kondo/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv", delim = ";",show_col_types = FALSE)

# Download last 12 months of data 
student_views <- article_pageviews(article = "Student", 
                           start = "2023010100",
                           end = "2023100100",
                           granularity = "monthly")
glimpse(student_views)

## Rows: 9
## Columns: 8
## $ project     <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
## $ language    <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en"
## $ article     <chr> "Student", "Student", "Student", "Student", "Student", "St…
## $ access      <chr> "all-access", "all-access", "all-access", "all-access", "a…
## $ agent       <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
## $ granularity <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "mo…
## $ date        <dttm> 2023-01-01, 2023-02-01, 2023-03-01, 2023-04-01, 2023-05-01…
## $ views       <dbl> 21699, 19772, 22950, 16000, 20826, 23666, 19782, 21416, 2…

# Plot the page views over time
plot(student_views$views, type = "l", 
     main = "Wikipedia Page Views for 'Student'")

# Summary statistics
summary(student_views$views)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16000   19782   21016   20792   21699   23666

# Convert date to character in student_views
student_views <- mutate(student_views, date = as.character(date))
# Create a sequence of dates based on the length of student_views
mpg_dates <- seq(as.Date("2023-01-01"), length.out = nrow(mpg), by = "1 month")

# Assign the dates to the mpg data frame
mpg$date <- mpg_dates


# Convert the "date" column to character
mpg$date <- as.character(mpg$date)

# Perform the left join
merged_data <- left_join(mpg, student_views, by = "date")

# Print the structure of merged_data
str(merged_data)

## spc_tbl_ [4,424 × 45] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Marital status                                : num [1:4424] 1 1 1 1 2 2 1 1 1 1 ...
##  $ Application mode                              : num [1:4424] 17 15 1 17 39 39 1 18 1 1 ...
##  $ Application order                             : num [1:4424] 5 1 5 2 1 1 1 4 3 1 ...
##  $ Course                                        : num [1:4424] 171 9254 9070 9773 8014 ...
##  $ Daytime/evening attendance                      : num [1:4424] 1 1 1 1 0 0 1 1 1 1 ...
##  $ Previous qualification                        : num [1:4424] 1 1 1 1 1 19 1 1 1 1 ...
##  $ Previous qualification (grade)                : num [1:4424] 122 160 122 122 100 ...
##  $ Nacionality                                   : num [1:4424] 1 1 1 1 1 1 1 1 62 1 ...
##  $ Mother's qualification                        : num [1:4424] 19 1 37 38 37 37 19 37 1 1 ...
##  $ Father's qualification                        : num [1:4424] 12 3 37 37 38 37 38 37 1 19 ...
##  $ Mother's occupation                           : num [1:4424] 5 3 9 5 9 9 7 9 9 4 ...
##  $ Father's occupation                           : num [1:4424] 9 3 9 3 9 7 10 9 9 7 ...
##  $ Admission grade                               : num [1:4424] 127 142 125 120 142 ...
##  $ Displaced                                     : num [1:4424] 1 1 1 1 0 0 1 1 0 1 ...
##  $ Educational special needs                     : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Debtor                                        : num [1:4424] 0 0 0 0 0 1 0 0 0 1 ...
##  $ Tuition fees up to date                       : num [1:4424] 1 0 0 1 1 1 1 0 1 0 ...
##  $ Gender                                        : num [1:4424] 1 1 1 0 0 1 0 1 0 0 ...
##  $ Scholarship holder                            : num [1:4424] 0 0 0 0 0 0 1 0 1 0 ...
##  $ Age at enrollment                             : num [1:4424] 20 19 19 20 45 50 18 22 21 18 ...
##  $ International                                 : num [1:4424] 0 0 0 0 0 0 0 0 1 0 ...
##  $ Curricular units 1st sem (credited)           : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Curricular units 1st sem (enrolled)           : num [1:4424] 0 6 6 6 6 5 7 5 6 6 ...
##  $ Curricular units 1st sem (evaluations)        : num [1:4424] 0 6 0 8 9 10 9 5 8 9 ...
##  $ Curricular units 1st sem (approved)           : num [1:4424] 0 6 0 6 5 5 7 0 6 5 ...
##  $ Curricular units 1st sem (grade)              : num [1:4424] 0 14 0 13.4 12.3 ...
##  $ Curricular units 1st sem (without evaluations): num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Curricular units 2nd sem (credited)           : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Curricular units 2nd sem (enrolled)           : num [1:4424] 0 6 6 6 6 5 8 5 6 6 ...
##  $ Curricular units 2nd sem (evaluations)        : num [1:4424] 0 6 0 10 6 17 8 5 7 14 ...
##  $ Curricular units 2nd sem (approved)           : num [1:4424] 0 6 0 5 6 5 8 0 6 2 ...
##  $ Curricular units 2nd sem (grade)              : num [1:4424] 0 13.7 0 12.4 13 ...
##  $ Curricular units 2nd sem (without evaluations): num [1:4424] 0 0 0 0 0 5 0 0 0 0 ...
##  $ Unemployment rate                             : num [1:4424] 10.8 13.9 10.8 9.4 13.9 16.2 15.5 15.5 16.2 8.9 ...
##  $ Inflation rate                                : num [1:4424] 1.4 -0.3 1.4 -0.8 -0.3 0.3 2.8 2.8 0.3 1.4 ...
##  $ GDP                                           : num [1:4424] 1.74 0.79 1.74 -3.12 0.79 -0.92 -4.06 -4.06 -0.92 3.51 ...
##  $ Target                                        : chr [1:4424] "Dropout" "Graduate" "Dropout" "Graduate" ...
##  $ date                                          : chr [1:4424] "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01" ...
##  $ project                                       : chr [1:4424] "wikipedia" "wikipedia" "wikipedia" "wikipedia" ...
##  $ language                                      : chr [1:4424] "en" "en" "en" "en" ...
##  $ article                                       : chr [1:4424] "Student" "Student" "Student" "Student" ...
##  $ access                                        : chr [1:4424] "all-access" "all-access" "all-access" "all-access" ...
##  $ agent                                         : chr [1:4424] "all-agents" "all-agents" "all-agents" "all-agents" ...
##  $ granularity                                   : chr [1:4424] "monthly" "monthly" "monthly" "monthly" ...
##  $ views                                         : num [1:4424] 21699 19772 22950 16000 20826 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Marital status` = col_double(),
##   ..   `Application mode` = col_double(),
##   ..   `Application order` = col_double(),
##   ..   Course = col_double(),
##   ..   `Daytime/evening attendance   ` = col_double(),
##   ..   `Previous qualification` = col_double(),
##   ..   `Previous qualification (grade)` = col_double(),
##   ..   Nacionality = col_double(),
##   ..   `Mother's qualification` = col_double(),
##   ..   `Father's qualification` = col_double(),
##   ..   `Mother's occupation` = col_double(),
##   ..   `Father's occupation` = col_double(),
##   ..   `Admission grade` = col_double(),
##   ..   Displaced = col_double(),
##   ..   `Educational special needs` = col_double(),
##   ..   Debtor = col_double(),
##   ..   `Tuition fees up to date` = col_double(),
##   ..   Gender = col_double(),
##   ..   `Scholarship holder` = col_double(),
##   ..   `Age at enrollment` = col_double(),
##   ..   International = col_double(),
##   ..   `Curricular units 1st sem (credited)` = col_double(),
##   ..   `Curricular units 1st sem (enrolled)` = col_double(),
##   ..   `Curricular units 1st sem (evaluations)` = col_double(),
##   ..   `Curricular units 1st sem (approved)` = col_double(),
##   ..   `Curricular units 1st sem (grade)` = col_double(),
##   ..   `Curricular units 1st sem (without evaluations)` = col_double(),
##   ..   `Curricular units 2nd sem (credited)` = col_double(),
##   ..   `Curricular units 2nd sem (enrolled)` = col_double(),
##   ..   `Curricular units 2nd sem (evaluations)` = col_double(),
##   ..   `Curricular units 2nd sem (approved)` = col_double(),
##   ..   `Curricular units 2nd sem (grade)` = col_double(),
##   ..   `Curricular units 2nd sem (without evaluations)` = col_double(),
##   ..   `Unemployment rate` = col_double(),
##   ..   `Inflation rate` = col_double(),
##   ..   GDP = col_double(),
##   ..   Target = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Check the first few rows of merged_data
head(merged_data)

## # A tibble: 6 × 45
##   `Marital status` `Application mode` `Application order` Course
##              <dbl>              <dbl>               <dbl>  <dbl>
## 1                1                 17                   5    171
## 2                1                 15                   1   9254
## 3                1                  1                   5   9070
## 4                1                 17                   2   9773
## 5                2                 39                   1   8014
## 6                2                 39                   1   9991
## # ℹ 41 more variables: `Daytime/evening attendance\t` <dbl>,
## #   `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## #   Nacionality <dbl>, `Mother's qualification` <dbl>,
## #   `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## #   `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## #   `Educational special needs` <dbl>, Debtor <dbl>,
## #   `Tuition fees up to date` <dbl>, Gender <dbl>, …

glimpse(merged_data)

## Rows: 4,424
## Columns: 45
## $ `Marital status`                                 <dbl> 1, 1, 1, 1, 2, 2, 1, …
## $ `Application mode`                               <dbl> 17, 15, 1, 17, 39, 39…
## $ `Application order`                              <dbl> 5, 1, 5, 2, 1, 1, 1, …
## $ Course                                           <dbl> 171, 9254, 9070, 9773…
## $ `Daytime/evening attendance\t`                   <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Previous qualification`                         <dbl> 1, 1, 1, 1, 1, 19, 1,…
## $ `Previous qualification (grade)`                 <dbl> 122.0, 160.0, 122.0, …
## $ Nacionality                                      <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `Mother's qualification`                         <dbl> 19, 1, 37, 38, 37, 37…
## $ `Father's qualification`                         <dbl> 12, 3, 37, 37, 38, 37…
## $ `Mother's occupation`                            <dbl> 5, 3, 9, 5, 9, 9, 7, …
## $ `Father's occupation`                            <dbl> 9, 3, 9, 3, 9, 7, 10,…
## $ `Admission grade`                                <dbl> 127.3, 142.5, 124.8, …
## $ Displaced                                        <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Educational special needs`                      <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ Debtor                                           <dbl> 0, 0, 0, 0, 0, 1, 0, …
## $ `Tuition fees up to date`                        <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ Gender                                           <dbl> 1, 1, 1, 0, 0, 1, 0, …
## $ `Scholarship holder`                             <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `Age at enrollment`                              <dbl> 20, 19, 19, 20, 45, 5…
## $ International                                    <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (credited)`            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (enrolled)`            <dbl> 0, 6, 6, 6, 6, 5, 7, …
## $ `Curricular units 1st sem (evaluations)`         <dbl> 0, 6, 0, 8, 9, 10, 9,…
## $ `Curricular units 1st sem (approved)`            <dbl> 0, 6, 0, 6, 5, 5, 7, …
## $ `Curricular units 1st sem (grade)`               <dbl> 0.00000, 14.00000, 0.…
## $ `Curricular units 1st sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (credited)`            <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (enrolled)`            <dbl> 0, 6, 6, 6, 6, 5, 8, …
## $ `Curricular units 2nd sem (evaluations)`         <dbl> 0, 6, 0, 10, 6, 17, 8…
## $ `Curricular units 2nd sem (approved)`            <dbl> 0, 6, 0, 5, 6, 5, 8, …
## $ `Curricular units 2nd sem (grade)`               <dbl> 0.00000, 13.66667, 0.…
## $ `Curricular units 2nd sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 5, 0, …
## $ `Unemployment rate`                              <dbl> 10.8, 13.9, 10.8, 9.4…
## $ `Inflation rate`                                 <dbl> 1.4, -0.3, 1.4, -0.8,…
## $ GDP                                              <dbl> 1.74, 0.79, 1.74, -3.…
## $ Target                                           <chr> "Dropout", "Graduate"…
## $ date                                             <chr> "2023-01-01", "2023-0…
## $ project                                          <chr> "wikipedia", "wikiped…
## $ language                                         <chr> "en", "en", "en", "en…
## $ article                                          <chr> "Student", "Student",…
## $ access                                           <chr> "all-access", "all-ac…
## $ agent                                            <chr> "all-agents", "all-ag…
## $ granularity                                      <chr> "monthly", "monthly",…
## $ views                                            <dbl> 21699, 19772, 22950, …

library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.2

## 
## Attaching package: 'tsibble'

## The following object is masked from 'package:lubridate':
## 
##     interval

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

# Convert to date
merged_data$date <- as.Date(merged_data$date, "%Y-%m-%d") 

# Calculate enrollment rate 
merged_data <- merged_data %>%
  group_by(date) %>%
  mutate(Traget_rate = sum(`Target` == "Dropout")/n())

# Visualize over time
ggplot(merged_data, aes(x = date, y = Traget_rate)) + 
  geom_line() +
  labs(title = "Dropout Rate Over Time",
       y = "Proportion Dropout")

# Statistical models
fit <- lm(Traget_rate ~ date, data = merged_data)
summary(fit) # view model summary

## 
## Call:
## lm(formula = Traget_rate ~ date, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3615 -0.3311 -0.3012  0.6549  0.7191 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.693e-01  1.714e-02  15.715  < 2e-16 ***
## date        5.987e-07  1.804e-07   3.319 0.000912 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4665 on 4422 degrees of freedom
## Multiple R-squared:  0.002484,   Adjusted R-squared:  0.002259 
## F-statistic: 11.01 on 1 and 4422 DF,  p-value: 0.0009121

# Create tsibble 
# Create tsibble
Target_ts <- merged_data %>%
  select(date, Traget_rate) %>%
  as_tsibble(index = date) 

# Ungroup 
Target_ts <- Target_ts %>%
  ungroup()
# Create the ggplot object
my_plot <- ggplot(data = Target_ts, aes(x = date, y = Traget_rate)) +
  geom_line() +
  labs(title = "Target Rate", x = "Date", y = "Target Rate")

print(my_plot)

# Save the plot as a PDF
#ggsave("C:\\Users\\kondo\\Desktop\\plot.pdf", plot = my_plot, device = "pdf")

# Assuming your date column is named 'date'
Target_ts_subset <- Target_ts %>%
  dplyr::filter(date >= as.Date("2016-01-01"))

# Plot the subset
my_plot <- ggplot(data = Target_ts_subset, aes(x = date, y = Traget_rate)) +
  geom_line() +
  labs(title = "Target Rate", x = "Date", y = "Target Rate")

print(my_plot)

# Fit a linear regression model
linear_model <- lm(Traget_rate ~ date, data = merged_data)

# View the summary of the model
summary(linear_model)

## 
## Call:
## lm(formula = Traget_rate ~ date, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3615 -0.3311 -0.3012  0.6549  0.7191 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.693e-01  1.714e-02  15.715  < 2e-16 ***
## date        5.987e-07  1.804e-07   3.319 0.000912 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4665 on 4422 degrees of freedom
## Multiple R-squared:  0.002484,   Adjusted R-squared:  0.002259 
## F-statistic: 11.01 on 1 and 4422 DF,  p-value: 0.0009121

It looks like you’ve printed the summary of a linear regression model. Here’s a breakdown of the key information provided:

Residuals:

Min: The minimum value of the residuals. 1Q: The first quartile of the residuals. Median: The median (second quartile) of the residuals. 3Q: The third quartile of the residuals. Max: The maximum value of the residuals. Coefficients:

Intercept: The estimated intercept of the regression equation. date: The estimated coefficient for the predictor variable ‘date’. Estimate: The estimated value of the coefficients. Std. Error: The standard error of the coefficients. t value: The t-value, which is a measure of how many standard errors the coefficient is away from zero. Pr(>|t|): The p-value associated with the t-value, indicating the significance of the corresponding coefficient. Smaller p-values suggest stronger evidence against the null hypothesis. Residual Standard Error:

The estimated standard deviation of the residuals. Multiple R-squared:

The proportion of the variance in the dependent variable explained by the independent variables. In this case, it’s a very small value (0.002484), suggesting that the model explains only a small amount of the variability in the dependent variable. Adjusted R-squared:

A modified version of R-squared that adjusts for the number of predictors in the model. F-statistic:

A measure of how well the overall model fits the data. It tests the null hypothesis that all the coefficients are zero. p-value:

The p-value associated with the F-statistic. A small p-value suggests that the overall model is statistically significant. In summary, the model doesn’t seem to explain much of the variability in the dependent variable, as indicated by the low R-squared value. The coefficient for ‘date’ is statistically significant (p-value < 0.05), but its effect size may be small given the small estimated coefficient and the low R-squared value.

# Smoothing using LOESS with a larger span
smoothed_data <- merged_data %>%
  arrange(date) %>%
  mutate(smoothed_rate = loess(Traget_rate ~ as.numeric(date), span = 1)$fitted)

## Warning: There were 53088 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `smoothed_rate = loess(Traget_rate ~ as.numeric(date), span =
##   1)$fitted`.
## ℹ In group 1: `date = 2023-01-01`.
## Caused by warning in `simpleLoess()`:
## ! span too small.   fewer data values than degrees of freedom.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 53087 remaining warnings.

# Visualize the original and smoothed data
ggplot(smoothed_data, aes(x = date)) +
  geom_line(aes(y = Traget_rate), color = "blue", alpha = 0.5) +
  geom_line(aes(y = smoothed_rate), color = "red") +
  labs(title = "Original vs. Smoothed Dropout Rate Over Time")

## Warning: Removed 4424 rows containing missing values (`geom_line()`).

# ACF
acf(merged_data$Traget_rate, main = "ACF: Dropout Rate")

# PACF
pacf(merged_data$Traget_rate, main = "PACF: Dropout Rate")

Week 12 | Data Dive — Time-based Data

Vaishali Kondoju

2023-11-14