library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(TSA)
## Warning: package 'TSA' was built under R version 4.3.2
## Registered S3 methods overwritten by 'TSA':
## method from
## fitted.Arima forecast
## plot.Arima forecast
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(tseries)
## Warning: package 'tseries' was built under R version 4.3.2
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(grid)
library(pageviews)
## Warning: package 'pageviews' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks TSA::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(boot)
library(broom)
library(lindia)
library(dplyr)
library(ggplot2)
library(dplyr)
library(glmnet) # For generalized linear models
## Warning: package 'glmnet' was built under R version 4.3.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
##
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
mpg<- read_delim("C:/Users/kondo/OneDrive/Desktop/INTRO to Statistics and R/Data Set and work/data.csv", delim = ";",show_col_types = FALSE)
# Download last 12 months of data
student_views <- article_pageviews(article = "Student",
start = "2023010100",
end = "2023100100",
granularity = "monthly")
glimpse(student_views)
## Rows: 9
## Columns: 8
## $ project <chr> "wikipedia", "wikipedia", "wikipedia", "wikipedia", "wikip…
## $ language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en"
## $ article <chr> "Student", "Student", "Student", "Student", "Student", "St…
## $ access <chr> "all-access", "all-access", "all-access", "all-access", "a…
## $ agent <chr> "all-agents", "all-agents", "all-agents", "all-agents", "a…
## $ granularity <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "mo…
## $ date <dttm> 2023-01-01, 2023-02-01, 2023-03-01, 2023-04-01, 2023-05-01…
## $ views <dbl> 21699, 19772, 22950, 16000, 20826, 23666, 19782, 21416, 2…
# Plot the page views over time
plot(student_views$views, type = "l",
main = "Wikipedia Page Views for 'Student'")
# Summary statistics
summary(student_views$views)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16000 19782 21016 20792 21699 23666
# Convert date to character in student_views
student_views <- mutate(student_views, date = as.character(date))
# Create a sequence of dates based on the length of student_views
mpg_dates <- seq(as.Date("2023-01-01"), length.out = nrow(mpg), by = "1 month")
# Assign the dates to the mpg data frame
mpg$date <- mpg_dates
# Convert the "date" column to character
mpg$date <- as.character(mpg$date)
# Perform the left join
merged_data <- left_join(mpg, student_views, by = "date")
# Print the structure of merged_data
str(merged_data)
## spc_tbl_ [4,424 × 45] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Marital status : num [1:4424] 1 1 1 1 2 2 1 1 1 1 ...
## $ Application mode : num [1:4424] 17 15 1 17 39 39 1 18 1 1 ...
## $ Application order : num [1:4424] 5 1 5 2 1 1 1 4 3 1 ...
## $ Course : num [1:4424] 171 9254 9070 9773 8014 ...
## $ Daytime/evening attendance : num [1:4424] 1 1 1 1 0 0 1 1 1 1 ...
## $ Previous qualification : num [1:4424] 1 1 1 1 1 19 1 1 1 1 ...
## $ Previous qualification (grade) : num [1:4424] 122 160 122 122 100 ...
## $ Nacionality : num [1:4424] 1 1 1 1 1 1 1 1 62 1 ...
## $ Mother's qualification : num [1:4424] 19 1 37 38 37 37 19 37 1 1 ...
## $ Father's qualification : num [1:4424] 12 3 37 37 38 37 38 37 1 19 ...
## $ Mother's occupation : num [1:4424] 5 3 9 5 9 9 7 9 9 4 ...
## $ Father's occupation : num [1:4424] 9 3 9 3 9 7 10 9 9 7 ...
## $ Admission grade : num [1:4424] 127 142 125 120 142 ...
## $ Displaced : num [1:4424] 1 1 1 1 0 0 1 1 0 1 ...
## $ Educational special needs : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
## $ Debtor : num [1:4424] 0 0 0 0 0 1 0 0 0 1 ...
## $ Tuition fees up to date : num [1:4424] 1 0 0 1 1 1 1 0 1 0 ...
## $ Gender : num [1:4424] 1 1 1 0 0 1 0 1 0 0 ...
## $ Scholarship holder : num [1:4424] 0 0 0 0 0 0 1 0 1 0 ...
## $ Age at enrollment : num [1:4424] 20 19 19 20 45 50 18 22 21 18 ...
## $ International : num [1:4424] 0 0 0 0 0 0 0 0 1 0 ...
## $ Curricular units 1st sem (credited) : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular units 1st sem (enrolled) : num [1:4424] 0 6 6 6 6 5 7 5 6 6 ...
## $ Curricular units 1st sem (evaluations) : num [1:4424] 0 6 0 8 9 10 9 5 8 9 ...
## $ Curricular units 1st sem (approved) : num [1:4424] 0 6 0 6 5 5 7 0 6 5 ...
## $ Curricular units 1st sem (grade) : num [1:4424] 0 14 0 13.4 12.3 ...
## $ Curricular units 1st sem (without evaluations): num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular units 2nd sem (credited) : num [1:4424] 0 0 0 0 0 0 0 0 0 0 ...
## $ Curricular units 2nd sem (enrolled) : num [1:4424] 0 6 6 6 6 5 8 5 6 6 ...
## $ Curricular units 2nd sem (evaluations) : num [1:4424] 0 6 0 10 6 17 8 5 7 14 ...
## $ Curricular units 2nd sem (approved) : num [1:4424] 0 6 0 5 6 5 8 0 6 2 ...
## $ Curricular units 2nd sem (grade) : num [1:4424] 0 13.7 0 12.4 13 ...
## $ Curricular units 2nd sem (without evaluations): num [1:4424] 0 0 0 0 0 5 0 0 0 0 ...
## $ Unemployment rate : num [1:4424] 10.8 13.9 10.8 9.4 13.9 16.2 15.5 15.5 16.2 8.9 ...
## $ Inflation rate : num [1:4424] 1.4 -0.3 1.4 -0.8 -0.3 0.3 2.8 2.8 0.3 1.4 ...
## $ GDP : num [1:4424] 1.74 0.79 1.74 -3.12 0.79 -0.92 -4.06 -4.06 -0.92 3.51 ...
## $ Target : chr [1:4424] "Dropout" "Graduate" "Dropout" "Graduate" ...
## $ date : chr [1:4424] "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01" ...
## $ project : chr [1:4424] "wikipedia" "wikipedia" "wikipedia" "wikipedia" ...
## $ language : chr [1:4424] "en" "en" "en" "en" ...
## $ article : chr [1:4424] "Student" "Student" "Student" "Student" ...
## $ access : chr [1:4424] "all-access" "all-access" "all-access" "all-access" ...
## $ agent : chr [1:4424] "all-agents" "all-agents" "all-agents" "all-agents" ...
## $ granularity : chr [1:4424] "monthly" "monthly" "monthly" "monthly" ...
## $ views : num [1:4424] 21699 19772 22950 16000 20826 ...
## - attr(*, "spec")=
## .. cols(
## .. `Marital status` = col_double(),
## .. `Application mode` = col_double(),
## .. `Application order` = col_double(),
## .. Course = col_double(),
## .. `Daytime/evening attendance ` = col_double(),
## .. `Previous qualification` = col_double(),
## .. `Previous qualification (grade)` = col_double(),
## .. Nacionality = col_double(),
## .. `Mother's qualification` = col_double(),
## .. `Father's qualification` = col_double(),
## .. `Mother's occupation` = col_double(),
## .. `Father's occupation` = col_double(),
## .. `Admission grade` = col_double(),
## .. Displaced = col_double(),
## .. `Educational special needs` = col_double(),
## .. Debtor = col_double(),
## .. `Tuition fees up to date` = col_double(),
## .. Gender = col_double(),
## .. `Scholarship holder` = col_double(),
## .. `Age at enrollment` = col_double(),
## .. International = col_double(),
## .. `Curricular units 1st sem (credited)` = col_double(),
## .. `Curricular units 1st sem (enrolled)` = col_double(),
## .. `Curricular units 1st sem (evaluations)` = col_double(),
## .. `Curricular units 1st sem (approved)` = col_double(),
## .. `Curricular units 1st sem (grade)` = col_double(),
## .. `Curricular units 1st sem (without evaluations)` = col_double(),
## .. `Curricular units 2nd sem (credited)` = col_double(),
## .. `Curricular units 2nd sem (enrolled)` = col_double(),
## .. `Curricular units 2nd sem (evaluations)` = col_double(),
## .. `Curricular units 2nd sem (approved)` = col_double(),
## .. `Curricular units 2nd sem (grade)` = col_double(),
## .. `Curricular units 2nd sem (without evaluations)` = col_double(),
## .. `Unemployment rate` = col_double(),
## .. `Inflation rate` = col_double(),
## .. GDP = col_double(),
## .. Target = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Check the first few rows of merged_data
head(merged_data)
## # A tibble: 6 × 45
## `Marital status` `Application mode` `Application order` Course
## <dbl> <dbl> <dbl> <dbl>
## 1 1 17 5 171
## 2 1 15 1 9254
## 3 1 1 5 9070
## 4 1 17 2 9773
## 5 2 39 1 8014
## 6 2 39 1 9991
## # ℹ 41 more variables: `Daytime/evening attendance\t` <dbl>,
## # `Previous qualification` <dbl>, `Previous qualification (grade)` <dbl>,
## # Nacionality <dbl>, `Mother's qualification` <dbl>,
## # `Father's qualification` <dbl>, `Mother's occupation` <dbl>,
## # `Father's occupation` <dbl>, `Admission grade` <dbl>, Displaced <dbl>,
## # `Educational special needs` <dbl>, Debtor <dbl>,
## # `Tuition fees up to date` <dbl>, Gender <dbl>, …
glimpse(merged_data)
## Rows: 4,424
## Columns: 45
## $ `Marital status` <dbl> 1, 1, 1, 1, 2, 2, 1, …
## $ `Application mode` <dbl> 17, 15, 1, 17, 39, 39…
## $ `Application order` <dbl> 5, 1, 5, 2, 1, 1, 1, …
## $ Course <dbl> 171, 9254, 9070, 9773…
## $ `Daytime/evening attendance\t` <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Previous qualification` <dbl> 1, 1, 1, 1, 1, 19, 1,…
## $ `Previous qualification (grade)` <dbl> 122.0, 160.0, 122.0, …
## $ Nacionality <dbl> 1, 1, 1, 1, 1, 1, 1, …
## $ `Mother's qualification` <dbl> 19, 1, 37, 38, 37, 37…
## $ `Father's qualification` <dbl> 12, 3, 37, 37, 38, 37…
## $ `Mother's occupation` <dbl> 5, 3, 9, 5, 9, 9, 7, …
## $ `Father's occupation` <dbl> 9, 3, 9, 3, 9, 7, 10,…
## $ `Admission grade` <dbl> 127.3, 142.5, 124.8, …
## $ Displaced <dbl> 1, 1, 1, 1, 0, 0, 1, …
## $ `Educational special needs` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ Debtor <dbl> 0, 0, 0, 0, 0, 1, 0, …
## $ `Tuition fees up to date` <dbl> 1, 0, 0, 1, 1, 1, 1, …
## $ Gender <dbl> 1, 1, 1, 0, 0, 1, 0, …
## $ `Scholarship holder` <dbl> 0, 0, 0, 0, 0, 0, 1, …
## $ `Age at enrollment` <dbl> 20, 19, 19, 20, 45, 5…
## $ International <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 1st sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 7, …
## $ `Curricular units 1st sem (evaluations)` <dbl> 0, 6, 0, 8, 9, 10, 9,…
## $ `Curricular units 1st sem (approved)` <dbl> 0, 6, 0, 6, 5, 5, 7, …
## $ `Curricular units 1st sem (grade)` <dbl> 0.00000, 14.00000, 0.…
## $ `Curricular units 1st sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (credited)` <dbl> 0, 0, 0, 0, 0, 0, 0, …
## $ `Curricular units 2nd sem (enrolled)` <dbl> 0, 6, 6, 6, 6, 5, 8, …
## $ `Curricular units 2nd sem (evaluations)` <dbl> 0, 6, 0, 10, 6, 17, 8…
## $ `Curricular units 2nd sem (approved)` <dbl> 0, 6, 0, 5, 6, 5, 8, …
## $ `Curricular units 2nd sem (grade)` <dbl> 0.00000, 13.66667, 0.…
## $ `Curricular units 2nd sem (without evaluations)` <dbl> 0, 0, 0, 0, 0, 5, 0, …
## $ `Unemployment rate` <dbl> 10.8, 13.9, 10.8, 9.4…
## $ `Inflation rate` <dbl> 1.4, -0.3, 1.4, -0.8,…
## $ GDP <dbl> 1.74, 0.79, 1.74, -3.…
## $ Target <chr> "Dropout", "Graduate"…
## $ date <chr> "2023-01-01", "2023-0…
## $ project <chr> "wikipedia", "wikiped…
## $ language <chr> "en", "en", "en", "en…
## $ article <chr> "Student", "Student",…
## $ access <chr> "all-access", "all-ac…
## $ agent <chr> "all-agents", "all-ag…
## $ granularity <chr> "monthly", "monthly",…
## $ views <dbl> 21699, 19772, 22950, …
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
# Convert to date
merged_data$date <- as.Date(merged_data$date, "%Y-%m-%d")
# Calculate enrollment rate
merged_data <- merged_data %>%
group_by(date) %>%
mutate(Traget_rate = sum(`Target` == "Dropout")/n())
# Visualize over time
ggplot(merged_data, aes(x = date, y = Traget_rate)) +
geom_line() +
labs(title = "Dropout Rate Over Time",
y = "Proportion Dropout")
# Statistical models
fit <- lm(Traget_rate ~ date, data = merged_data)
summary(fit) # view model summary
##
## Call:
## lm(formula = Traget_rate ~ date, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3615 -0.3311 -0.3012 0.6549 0.7191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.693e-01 1.714e-02 15.715 < 2e-16 ***
## date 5.987e-07 1.804e-07 3.319 0.000912 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4665 on 4422 degrees of freedom
## Multiple R-squared: 0.002484, Adjusted R-squared: 0.002259
## F-statistic: 11.01 on 1 and 4422 DF, p-value: 0.0009121
# Create tsibble
# Create tsibble
Target_ts <- merged_data %>%
select(date, Traget_rate) %>%
as_tsibble(index = date)
# Ungroup
Target_ts <- Target_ts %>%
ungroup()
# Create the ggplot object
my_plot <- ggplot(data = Target_ts, aes(x = date, y = Traget_rate)) +
geom_line() +
labs(title = "Target Rate", x = "Date", y = "Target Rate")
print(my_plot)
# Save the plot as a PDF
#ggsave("C:\\Users\\kondo\\Desktop\\plot.pdf", plot = my_plot, device = "pdf")
# Assuming your date column is named 'date'
Target_ts_subset <- Target_ts %>%
dplyr::filter(date >= as.Date("2016-01-01"))
# Plot the subset
my_plot <- ggplot(data = Target_ts_subset, aes(x = date, y = Traget_rate)) +
geom_line() +
labs(title = "Target Rate", x = "Date", y = "Target Rate")
print(my_plot)
# Fit a linear regression model
linear_model <- lm(Traget_rate ~ date, data = merged_data)
# View the summary of the model
summary(linear_model)
##
## Call:
## lm(formula = Traget_rate ~ date, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3615 -0.3311 -0.3012 0.6549 0.7191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.693e-01 1.714e-02 15.715 < 2e-16 ***
## date 5.987e-07 1.804e-07 3.319 0.000912 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4665 on 4422 degrees of freedom
## Multiple R-squared: 0.002484, Adjusted R-squared: 0.002259
## F-statistic: 11.01 on 1 and 4422 DF, p-value: 0.0009121
It looks like you’ve printed the summary of a linear regression model. Here’s a breakdown of the key information provided:
Residuals:
Min: The minimum value of the residuals. 1Q: The first quartile of the residuals. Median: The median (second quartile) of the residuals. 3Q: The third quartile of the residuals. Max: The maximum value of the residuals. Coefficients:
Intercept: The estimated intercept of the regression equation. date: The estimated coefficient for the predictor variable ‘date’. Estimate: The estimated value of the coefficients. Std. Error: The standard error of the coefficients. t value: The t-value, which is a measure of how many standard errors the coefficient is away from zero. Pr(>|t|): The p-value associated with the t-value, indicating the significance of the corresponding coefficient. Smaller p-values suggest stronger evidence against the null hypothesis. Residual Standard Error:
The estimated standard deviation of the residuals. Multiple R-squared:
The proportion of the variance in the dependent variable explained by the independent variables. In this case, it’s a very small value (0.002484), suggesting that the model explains only a small amount of the variability in the dependent variable. Adjusted R-squared:
A modified version of R-squared that adjusts for the number of predictors in the model. F-statistic:
A measure of how well the overall model fits the data. It tests the null hypothesis that all the coefficients are zero. p-value:
The p-value associated with the F-statistic. A small p-value suggests that the overall model is statistically significant. In summary, the model doesn’t seem to explain much of the variability in the dependent variable, as indicated by the low R-squared value. The coefficient for ‘date’ is statistically significant (p-value < 0.05), but its effect size may be small given the small estimated coefficient and the low R-squared value.
# Smoothing using LOESS with a larger span
smoothed_data <- merged_data %>%
arrange(date) %>%
mutate(smoothed_rate = loess(Traget_rate ~ as.numeric(date), span = 1)$fitted)
## Warning: There were 53088 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `smoothed_rate = loess(Traget_rate ~ as.numeric(date), span =
## 1)$fitted`.
## ℹ In group 1: `date = 2023-01-01`.
## Caused by warning in `simpleLoess()`:
## ! span too small. fewer data values than degrees of freedom.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 53087 remaining warnings.
# Visualize the original and smoothed data
ggplot(smoothed_data, aes(x = date)) +
geom_line(aes(y = Traget_rate), color = "blue", alpha = 0.5) +
geom_line(aes(y = smoothed_rate), color = "red") +
labs(title = "Original vs. Smoothed Dropout Rate Over Time")
## Warning: Removed 4424 rows containing missing values (`geom_line()`).
# ACF
acf(merged_data$Traget_rate, main = "ACF: Dropout Rate")
# PACF
pacf(merged_data$Traget_rate, main = "PACF: Dropout Rate")