## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
## Loading required package: reshape2
## Number of NA's Per-col:
## Student_ID Gender Height_cm
## 0 0 0
## Weight_kg Eleventh_Grade_GPA Twelfth_Grade_GPA
## 0 0 0
## College_GPA Certification_Course Hobbies
## 0 0 0
## Daily_Study_Time_mins Preferred_Study_Time Salary_Expectation
## 0 0 0
## Likes_Degree Social_Media_Use_mins Commute_Time_mins
## 0 0 0
## Stress_Level Financial_Status Part_Time_Job
## 0 0 0
## Average_GPA
## 0
## are there any duplicate ppl?
## length(unique(student_tbl$Student_ID)) == nrow(student_tbl)
## [1] TRUE
## What are the unique value of "Preferred_Study_Time"?
##
## [1] "Morning" "No preference" "Night"
##
## 47 people prefer to study at night
#What percentage of students spend at least 90 minutes on social media and work a part time job? Please provide your answer rounded to three decimal places.
cat("possible values of Part_Time_Job: \n")
## possible values of Part_Time_Job:
unique(student_tbl$Part_Time_Job) #possible values
## [1] "No" "Yes"
part_time_and_sm <- nrow(student_tbl %>%
select(Social_Media_Use_mins, Part_Time_Job) %>%
filter(Part_Time_Job == "Yes" & Social_Media_Use_mins >= 90))
cat(round(part_time_and_sm/nrow(student_tbl), 3) * 100, "% of people work and go on social media that much")
## 5.5 % of people work and go on social media that much
#Which hobby is associated with the lowest average stress levels among students? Please provide the hobby and the associated average stress level rounded to three decimal places.
student_tbl %>% select(Hobbies, Stress_Level) %>% group_by(Hobbies) %>% summarize(avr_stress_per_hobby = round(mean(Stress_Level), 3)) %>% arrange(desc(avr_stress_per_hobby)) #so Cinema
## # A tibble: 4 × 2
## Hobbies avr_stress_per_hobby
## <chr> <dbl>
## 1 Reading books 2.67
## 2 Sports 2.66
## 3 Video Games 2.56
## 4 Cinema 2.51
## # A tibble: 235 × 2
## Student_ID Median_GPA
## <int> <dbl>
## 1 1 3.2
## 2 2 2.86
## 3 3 2.60
## 4 4 2.51
## 5 5 1.63
## 6 6 3.19
## 7 7 2.68
## 8 8 2.62
## 9 9 2.87
## 10 10 2.77
## # ℹ 225 more rows
## # A tibble: 235 × 2
## Student_ID Median_GPA
## <int> <dbl>
## 1 225 3.96
## 2 203 3.92
## 3 30 3.84
## 4 66 3.83
## 5 215 3.83
## 6 226 3.79
## 7 59 3.72
## 8 27 3.67
## 9 38 3.67
## 10 60 3.67
## # ℹ 225 more rows
## [1] 225
#1. Your friend who commutes from Pasadena to UCLA often blames the long drive for their tanking GPA. As a skeptical data scientist, this compels you to investigate whether there truly exists a relationship between commute time and academic performance.
#Formulate a claim on this possible relationship and analyze any trends or patterns in the data that would support your claim.
#Be sure to discuss any limitations or factors in the dataset that may affect the validity of your results.
head(student_tbl)
## Student_ID Gender Height_cm Weight_kg Eleventh_Grade_GPA Twelfth_Grade_GPA
## 1 1 Male 100 58 3.224490 2.723404
## 2 2 Female 90 40 2.857143 3.404255
## 3 3 Male 159 78 2.897959 2.595745
## 4 4 Female 147 20 2.857143 2.510638
## 5 5 Male 170 54 1.632653 2.765957
## 6 6 Female 139 33 3.673469 3.191489
## College_GPA Certification_Course Hobbies Daily_Study_Time_mins
## 1 3.20 No Video Games 188.01559
## 2 2.80 No Cinema 136.77480
## 3 2.20 Yes Cinema 124.78446
## 4 2.32 Yes Reading books 11.09966
## 5 1.20 No Video Games 30.00000
## 6 2.80 Yes Cinema 64.15472
## Preferred_Study_Time Salary_Expectation Likes_Degree Social_Media_Use_mins
## 1 Morning 40000 No 95
## 2 Morning 15000 Yes 65
## 3 No preference 13000 Yes 140
## 4 No preference 1500000 No 90
## 5 Morning 50000 Yes 115
## 6 Night 20000 Yes 40
## Commute_Time_mins Stress_Level Financial_Status Part_Time_Job Average_GPA
## 1 13.84624 2 2 No 3.049298
## 2 49.56766 2 2 No 3.020466
## 3 22.67896 1 2 No 2.564568
## 4 88.31212 2 3 No 2.562594
## 5 145.19791 3 3 No 1.866204
## 6 75.79959 2 3 No 3.221653
#Make useful data set:
my_student_tbl <- student_tbl %>%
rowwise() %>%
mutate(Median_GPA = median(c(Eleventh_Grade_GPA, Twelfth_Grade_GPA, College_GPA))) %>%
ungroup() %>%
select(Student_ID, Eleventh_Grade_GPA, Twelfth_Grade_GPA, College_GPA, Commute_Time_mins, Average_GPA, Median_GPA)
head(my_student_tbl)
## # A tibble: 6 × 7
## Student_ID Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Commute_Time_mins
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 3.22 2.72 3.2 13.8
## 2 2 2.86 3.40 2.8 49.6
## 3 3 2.90 2.60 2.2 22.7
## 4 4 2.86 2.51 2.32 88.3
## 5 5 1.63 2.77 1.2 145.
## 6 6 3.67 3.19 2.8 75.8
## # ℹ 2 more variables: Average_GPA <dbl>, Median_GPA <dbl>
According to Commuter support & Program (UCLA), we will define a commuter as anyone who does not live on campus. However, unfortunately we do not have an exact binary-indicator of “commuter”–so we must do a bit more research with some creativity.
Lets then consider our campus:
Note that I have arbitrarily chose the Math & Science building–as this is a building I most frequently take classes; and, therefore are unrepresentative of non-STEM and other majors. However, for the purposes of this assignment, this measure will do just fine: ## Commuting-Times:
## Based on UCLA's Campus, we suspect that typically, non-commuters are traveling around: 15.08 min to class each day. The HIGHEST travel time would be roughly 20 minutes.
Therefore, we i’ll consider those with a travel time greater than 20 minutes as: commuters.
## # A tibble: 6 × 7
## Student_ID Eleventh_Grade_GPA Twelfth_Grade_GPA College_GPA Commute_Time_mins
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 3.22 2.72 3.2 13.8
## 2 2 2.86 3.40 2.8 49.6
## 3 3 2.90 2.60 2.2 22.7
## 4 4 2.86 2.51 2.32 88.3
## 5 5 1.63 2.77 1.2 145.
## 6 6 3.67 3.19 2.8 75.8
## # ℹ 2 more variables: Average_GPA <dbl>, Median_GPA <dbl>
##
## Welch Two Sample t-test
##
## data: commuters$Average_GPA and non_commuters$Average_GPA
## t = -9.0944, df = 129.89, p-value = 7.014e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.3736875
## sample estimates:
## mean of x mean of y
## 2.852415 3.309337
# Create the binary indicator for commuters
my_student_tbl$commuter_bool <- ifelse(my_student_tbl$Commute_Time_mins >= 20, "Commuter", "Non-Commuter")
# Boxplot to visualize the GPA distribution of commuters vs non-commuters
ggplot(my_student_tbl, aes(x = commuter_bool, y = Average_GPA, fill = commuter_bool)) +
geom_boxplot(alpha = 0.7) +
stat_summary(fun = mean, geom = "point", shape = 20, size = 4, color = "black", position = position_dodge(width = 0.75)) +
labs(title = "Comparison of Average GPA: Commuters vs Non-Commuters",
x = "Group",
y = "Average GPA") +
theme_minimal() +
theme(legend.position = "none")
#2. Many people say that "cash is king" and believe that having a higher economic status is associated with better outcomes in one's academic, professional, and personal life.
#To what degree, if any, does the data support this ideology? Use evidence from the dataset to build your answer.
#Make sure to thoroughly explain your reasoning and note any potential drawbacks to your approach
## Eleventh_Grade_GPA Twelfth_Grade_GPA
## Eleventh_Grade_GPA 1.000000000 0.47325384
## Twelfth_Grade_GPA 0.473253837 1.00000000
## College_GPA 0.465860569 0.42482846
## Commute_Time_mins -0.366295807 -0.36402487
## Average_GPA 0.803807151 0.75825444
## Daily_Study_Time_mins 0.296053031 0.33351847
## Financial_Status -0.014765484 0.01530810
## Stress_Level -0.033775517 0.11923832
## Likes_Degree_numeric_bool -0.044075193 0.02013593
## Certification_Course_numeric_bool 0.256515407 0.04485845
## Part_Time_Job_numeric_bool -0.001368391 0.01508233
## College_GPA Commute_Time_mins Average_GPA
## Eleventh_Grade_GPA 0.46586057 -0.36629581 0.80380715
## Twelfth_Grade_GPA 0.42482846 -0.36402487 0.75825444
## College_GPA 1.00000000 -0.70509920 0.82595544
## Commute_Time_mins -0.70509920 1.00000000 -0.62103290
## Average_GPA 0.82595544 -0.62103290 1.00000000
## Daily_Study_Time_mins 0.57085580 -0.81518919 0.51654830
## Financial_Status 0.04072742 -0.04191137 0.01913415
## Stress_Level 0.06214947 -0.05133578 0.05909861
## Likes_Degree_numeric_bool 0.12506893 -0.10617234 0.04960829
## Certification_Course_numeric_bool 0.19839685 -0.09616923 0.21673826
## Part_Time_Job_numeric_bool 0.04280824 0.04157098 0.02553218
## Daily_Study_Time_mins Financial_Status
## Eleventh_Grade_GPA 0.29605303 -0.01476548
## Twelfth_Grade_GPA 0.33351847 0.01530810
## College_GPA 0.57085580 0.04072742
## Commute_Time_mins -0.81518919 -0.04191137
## Average_GPA 0.51654830 0.01913415
## Daily_Study_Time_mins 1.00000000 0.05551385
## Financial_Status 0.05551385 1.00000000
## Stress_Level 0.05743608 0.18777861
## Likes_Degree_numeric_bool 0.10168286 0.17947938
## Certification_Course_numeric_bool 0.03213578 0.07636272
## Part_Time_Job_numeric_bool -0.01574116 -0.06113211
## Stress_Level Likes_Degree_numeric_bool
## Eleventh_Grade_GPA -0.033775517 -0.04407519
## Twelfth_Grade_GPA 0.119238324 0.02013593
## College_GPA 0.062149470 0.12506893
## Commute_Time_mins -0.051335784 -0.10617234
## Average_GPA 0.059098611 0.04960829
## Daily_Study_Time_mins 0.057436083 0.10168286
## Financial_Status 0.187778612 0.17947938
## Stress_Level 1.000000000 0.01980747
## Likes_Degree_numeric_bool 0.019807467 1.00000000
## Certification_Course_numeric_bool -0.006992094 0.09368154
## Part_Time_Job_numeric_bool -0.006773793 -0.10088463
## Certification_Course_numeric_bool
## Eleventh_Grade_GPA 0.256515407
## Twelfth_Grade_GPA 0.044858446
## College_GPA 0.198396847
## Commute_Time_mins -0.096169231
## Average_GPA 0.216738260
## Daily_Study_Time_mins 0.032135775
## Financial_Status 0.076362717
## Stress_Level -0.006992094
## Likes_Degree_numeric_bool 0.093681540
## Certification_Course_numeric_bool 1.000000000
## Part_Time_Job_numeric_bool 0.128576687
## Part_Time_Job_numeric_bool
## Eleventh_Grade_GPA -0.001368391
## Twelfth_Grade_GPA 0.015082334
## College_GPA 0.042808240
## Commute_Time_mins 0.041570981
## Average_GPA 0.025532177
## Daily_Study_Time_mins -0.015741157
## Financial_Status -0.061132115
## Stress_Level -0.006773793
## Likes_Degree_numeric_bool -0.100884627
## Certification_Course_numeric_bool 0.128576687
## Part_Time_Job_numeric_bool 1.000000000
Consider:
Identify Strong Correlations:
Average GPA and College GPA have a high positive
correlation (0.8259
), suggesting that College GPA strongly
influences the overall Average GPA.
Commute Time and College GPA have a
strong negative correlation (-0.7051
),
indicating that longer commute times are associated with lower College
GPA.
Commute Time and Daily Study Time also have a
strong negative correlation (-0.8152
), suggesting that
students with longer commute times have less time to study.
Identify Weak or Insignificant Correlations:
Financial Status has very weak correlations with
GPA metrics (College GPA
:
0.0407
, Average GPA
: 0.0191
).
This suggests that financial status may not be a strong predictor of
academic performance in this dataset.
Part-Time Job, Likes Degree, and Certification Course show weak correlations with GPA metrics, suggesting they might not be major contributors to academic success.
Stress Level also shows very weak correlations with all GPA metrics, indicating it may not be an influential variable here.
Therefore, we should consider those with moderate to strong correlations (abs(value) > 50%).
##
## Call:
## lm(formula = Average_GPA ~ College_GPA + Eleventh_Grade_GPA +
## Twelfth_Grade_GPA + Commute_Time_mins + Daily_Study_Time_mins,
## data = filtered_student_tbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.733e-15 -5.900e-16 -3.280e-16 4.100e-17 6.258e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.617e-15 3.116e-15 1.803e+00 0.0728 .
## College_GPA 3.333e-01 6.521e-16 5.112e+14 <2e-16 ***
## Eleventh_Grade_GPA 3.333e-01 6.173e-16 5.400e+14 <2e-16 ***
## Twelfth_Grade_GPA 3.333e-01 6.894e-16 4.835e+14 <2e-16 ***
## Commute_Time_mins 2.729e-20 1.568e-17 2.000e-03 0.9986
## Daily_Study_Time_mins 6.668e-18 8.578e-18 7.770e-01 0.4377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.175e-15 on 229 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 5.066e+29 on 5 and 229 DF, p-value: < 2.2e-16
(Intercept): The estimated intercept is
5.617e-15
, which is extremely close to zero. This value is
the predicted Average_GPA
when all other variables are
zero. However, this isn’t practically meaningful since all GPA values
and study times cannot be zero.
College_GPA
,
Eleventh_Grade_GPA
, and
Twelfth_Grade_GPA
:
All have coefficients of 0.333
.
This suggests that each of these GPA metrics contributes equally to the
prediction of Average_GPA
.
These variables are highly significant, as indicated by the
extremely small p-values (<2e-16
),
suggesting a very strong relationship between these individual GPA
metrics and the overall average GPA.
Commute_Time_mins
and
Daily_Study_Time_mins
:
The estimated coefficients are extremely small
(2.729e-20
and 6.668e-18
, respectively),
indicating that these variables have almost no effect on the
Average_GPA
.
The p-values are quite high (0.9986
for Commute_Time_mins
and 0.4377
for
Daily_Study_Time_mins
), suggesting that these variables are
not statistically significant in predicting
Average_GPA
when controlling for other factors.
The extremely high R-squared and adjusted R-squared values suggest multicollinearity—the predictors are likely highly correlated with each other, leading to a near-perfect fit. This could imply redundancy in the model, as the individual GPA metrics might be collinear.
The residuals being almost zero may indicate an overfit model that might not generalize well to other datasets.
Therefore, we will have to re-make the model!!!
# Fit a revised multiple linear regression model using only the significant predictors
revised_model <- lm(Average_GPA ~ College_GPA + Eleventh_Grade_GPA + Twelfth_Grade_GPA, data = filtered_student_tbl)
# Summary of the revised model to understand the influence of each variable
summary(revised_model)
##
## Call:
## lm(formula = Average_GPA ~ College_GPA + Eleventh_Grade_GPA +
## Twelfth_Grade_GPA, data = filtered_student_tbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.394e-15 -5.210e-16 -3.050e-16 -4.500e-17 6.292e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.214e-15 1.965e-15 2.653e+00 0.00854 **
## College_GPA 3.333e-01 5.072e-16 6.572e+14 < 2e-16 ***
## Eleventh_Grade_GPA 3.333e-01 6.157e-16 5.414e+14 < 2e-16 ***
## Twelfth_Grade_GPA 3.333e-01 6.836e-16 4.876e+14 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.168e-15 on 231 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 8.472e+29 on 3 and 231 DF, p-value: < 2.2e-16
# Calculate RMSE
rmse <- sqrt(mean(revised_model$residuals^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n") #we should expect this cuz our model is looking over fit
## Root Mean Squared Error (RMSE): 4.132403e-15
# Residual plot to check if the residuals are randomly distributed
plot(revised_model$fitted.values, revised_model$residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residual Plot")
abline(h = 0, col = "red")
As we can see, our model seems to have performed well. Most students get around a 3.0. It appears to be roughly normally distributed. The residuals is around 0 so it looks like it doesnt over or under predict. So it appears that there is no systematic issue with no bias. Therefore, I believe I have accurately predicted the predictor variable (GPA).
This model however, may again be over-fitted. We would have to test the model on more data.