# Read the data
students <- read.csv("data_students.csv")
faculty <- read.csv("data_faculty.csv")
library(visdat)
## Warning: package 'visdat' was built under R version 3.6.3
vis_dat(students)
vis_miss(students)
summary(students)
## age cob course faculty
## Min. :20.00 Italy :266 EU :249 Business :249
## 1st Qu.:24.00 Spain :265 leadership: 83 Economics :247
## Median :28.00 Germany:152 marketing : 83 Political Science:249
## Mean :27.75 Austria:117 micro : 83 Sociology :249
## 3rd Qu.:32.00 France : 73 discourse : 50 NA's : 6
## Max. :35.00 (Other):121 (Other) :446
## NA's :6 NA's : 6 NA's : 6
## gpa_2010 gpa_2011 gpa_2012 gpa_2013
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.100
## 1st Qu.:1.300 1st Qu.:1.600 1st Qu.:1.800 1st Qu.:2.000
## Median :1.900 Median :2.200 Median :2.400 Median :2.700
## Mean :1.942 Mean :2.247 Mean :2.454 Mean :2.677
## 3rd Qu.:2.600 3rd Qu.:2.900 3rd Qu.:3.100 3rd Qu.:3.300
## Max. :5.600 Max. :5.400 Max. :5.400 Max. :5.800
## NA's :6 NA's :6 NA's :6 NA's :6
## gpa_2014 gpa_2015 gpa_2016 gpa_2017 gpa_2018
## Min. :0.000 Min. :0.200 Min. :0.00 Min. :0.000 Min. :0.000
## 1st Qu.:2.200 1st Qu.:2.100 1st Qu.:1.40 1st Qu.:1.000 1st Qu.:2.500
## Median :2.800 Median :2.800 Median :2.00 Median :1.600 Median :3.200
## Mean :2.808 Mean :2.801 Mean :2.06 Mean :1.632 Mean :3.177
## 3rd Qu.:3.400 3rd Qu.:3.500 3rd Qu.:2.70 3rd Qu.:2.200 3rd Qu.:3.800
## Max. :5.800 Max. :5.900 Max. :5.20 Max. :4.800 Max. :5.900
## NA's :6 NA's :6 NA's :6 NA's :6 NA's :6
## gpa_2019 gpa_2020 job lifesat like
## Min. :0.100 Min. :1.000 no :497 Min. : 0.00 Min. :0.000
## 1st Qu.:2.700 1st Qu.:3.300 yes :497 1st Qu.: 25.00 1st Qu.:1.000
## Median :3.400 Median :4.000 NA's: 6 Median : 51.50 Median :3.000
## Mean :3.361 Mean :3.928 Mean : 51.09 Mean :2.817
## 3rd Qu.:4.000 3rd Qu.:4.500 3rd Qu.: 77.00 3rd Qu.:4.000
## Max. :5.900 Max. :6.000 Max. :100.00 Max. :5.000
## NA's :6 NA's :6 NA's :6 NA's :6
## relationship sex term university
## In a relationship:329 Female:448 Min. : 0.000 Berlin:994
## Single :665 Male :546 1st Qu.: 3.000 NA's : 6
## NA's : 6 NA's : 6 Median : 7.000
## Mean : 7.177
## 3rd Qu.:11.000
## Max. :14.000
## NA's :6
library(skimr)
## Warning: package 'skimr' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
students %>%
skim()
| Name | Piped data |
| Number of rows | 1000 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| factor | 7 |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| cob | 6 | 0.99 | FALSE | 7 | Ita: 266, Spa: 265, Ger: 152, Aus: 117 |
| course | 6 | 0.99 | FALSE | 16 | EU: 249, lea: 83, mar: 83, mic: 83 |
| faculty | 6 | 0.99 | FALSE | 4 | Bus: 249, Pol: 249, Soc: 249, Eco: 247 |
| job | 6 | 0.99 | FALSE | 2 | no: 497, yes: 497 |
| relationship | 6 | 0.99 | FALSE | 2 | Sin: 665, In : 329 |
| sex | 6 | 0.99 | FALSE | 2 | Mal: 546, Fem: 448 |
| university | 6 | 0.99 | FALSE | 1 | Ber: 994 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 6 | 0.99 | 27.75 | 4.65 | 20.0 | 24.0 | 28.0 | 32.0 | 35.0 | ▇▆▆▆▇ |
| gpa_2010 | 6 | 0.99 | 1.94 | 0.96 | 0.0 | 1.3 | 1.9 | 2.6 | 5.6 | ▅▇▆▂▁ |
| gpa_2011 | 6 | 0.99 | 2.25 | 0.97 | 0.0 | 1.6 | 2.2 | 2.9 | 5.4 | ▂▇▇▃▁ |
| gpa_2012 | 6 | 0.99 | 2.45 | 0.98 | 0.0 | 1.8 | 2.4 | 3.1 | 5.4 | ▂▆▇▃▁ |
| gpa_2013 | 6 | 0.99 | 2.68 | 0.98 | 0.1 | 2.0 | 2.7 | 3.3 | 5.8 | ▁▅▇▂▁ |
| gpa_2014 | 6 | 0.99 | 2.81 | 0.97 | 0.0 | 2.2 | 2.8 | 3.4 | 5.8 | ▁▅▇▅▁ |
| gpa_2015 | 6 | 0.99 | 2.80 | 0.99 | 0.2 | 2.1 | 2.8 | 3.5 | 5.9 | ▂▆▇▃▁ |
| gpa_2016 | 6 | 0.99 | 2.06 | 0.93 | 0.0 | 1.4 | 2.0 | 2.7 | 5.2 | ▃▇▇▂▁ |
| gpa_2017 | 6 | 0.99 | 1.63 | 0.88 | 0.0 | 1.0 | 1.6 | 2.2 | 4.8 | ▃▇▅▂▁ |
| gpa_2018 | 6 | 0.99 | 3.18 | 1.00 | 0.0 | 2.5 | 3.2 | 3.8 | 5.9 | ▁▃▇▆▁ |
| gpa_2019 | 6 | 0.99 | 3.36 | 0.96 | 0.1 | 2.7 | 3.4 | 4.0 | 5.9 | ▁▃▇▇▂ |
| gpa_2020 | 6 | 0.99 | 3.93 | 0.91 | 1.0 | 3.3 | 4.0 | 4.5 | 6.0 | ▁▃▇▇▂ |
| lifesat | 6 | 0.99 | 51.09 | 29.59 | 0.0 | 25.0 | 51.5 | 77.0 | 100.0 | ▇▇▇▇▇ |
| like | 6 | 0.99 | 2.82 | 1.67 | 0.0 | 1.0 | 3.0 | 4.0 | 5.0 | ▇▅▆▅▆ |
| term | 6 | 0.99 | 7.18 | 4.42 | 0.0 | 3.0 | 7.0 | 11.0 | 14.0 | ▇▇▇▇▇ |
# 1. Create a summary table:
# a) the percentage of non-German students,
# b) average life satisfaction,
# c) percentage of students in a relationship,
# d) sex ratio, e) percentage of students over 30,
# f) average gpa in 2010 and
# g) average number of terms per faculty.
agePct <- nrow(students[students$age>30,])/nrow(students)
nonGermanPct<- nrow(students[students$cob!="Germany",])/
nrow(students)
inARelationshipPct <- prop.table(table(students$relationship))[1]
summaryTable <- students %>% summarise(
avgLifeSat = mean(lifesat,na.rm=TRUE),
avgGpa2010 = mean(gpa_2010,na.rm=TRUE),
)
summaryTable$agePct <- agePct
summaryTable$nonGermanPct <- nonGermanPct
summaryTable$inARelationshipPct <- inARelationshipPct
# 2. Create a visual showing differences in average life satisfaction by faculty and relationship status.
library(ggplot2)
students %>%
na.omit() %>%
ggplot(aes(x = lifesat)) +
geom_bar() +
facet_grid(factor(relationship) ~ factor(faculty))+
theme_bw()
# 3. Combine faculty data with student data into one dataset. Show differences in the average cost of the career by faculty and job status.
stu_fac <- merge(students, faculty, by="faculty")
library(tidyr)
stu_fac %>%
group_by(faculty,job) %>%
summarise(meanCostOfCareer = mean(costs)) %>%
pivot_wider(names_from = faculty, values_from =
meanCostOfCareer)
## # A tibble: 2 x 5
## job Business Economics `Political Science` Sociology
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 no 37245 32396 25216 27051
## 2 yes 37245 32396 25216 27051
Given that the cost of the program/career is given at faculty/department level, it does not change with the job status of the student atleast not until we adjust for the student income related to the campus job.
# 4. Create a visual showing the relationship between life satisfaction and age.
library(ggplot2)
students %>%
na.omit() %>%
ggplot(aes( x = age, y = lifesat, group = age)) +
geom_boxplot() +
theme_classic()
Some of the variable relationships are shown here:
library(ggplot2)
par(mfrow=c(2,2))
students %>% na.omit() %>%
ggplot(aes(y=lifesat, group = relationship)) +
geom_boxplot()
students %>% na.omit() %>%
ggplot(aes(y=lifesat, group = age)) +
geom_boxplot()
students %>% na.omit() %>%
ggplot(aes(y=lifesat, group = sex)) +
geom_boxplot()
students %>% na.omit() %>%
ggplot(aes(y=lifesat, x = term, group = term)) +
geom_boxplot()
Model Building
lmdata <- merge(students, faculty, by = "faculty", all.x = TRUE)
# Build linear regression model to test the relationship between the response(lifesat - 0 - 100) and predictor(relationship status - Single. Not single)
model <- lm(lifesat ~ relationship + age + job + term + sex + salary , data=lmdata)
summary(model)
##
## Call:
## lm(formula = lifesat ~ relationship + age + job + term + sex +
## salary, data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.125 -24.748 0.253 26.666 53.174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.568e+01 7.403e+00 6.171 9.92e-10 ***
## relationshipSingle 2.065e+00 1.994e+00 1.036 0.3005
## age 1.210e-01 2.022e-01 0.599 0.5496
## jobyes 8.886e-01 1.882e+00 0.472 0.6370
## term 1.019e-01 2.128e-01 0.479 0.6321
## sexMale -4.707e+00 1.890e+00 -2.490 0.0129 *
## salary 2.828e-05 5.434e-05 0.520 0.6028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.56 on 987 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.008131, Adjusted R-squared: 0.002101
## F-statistic: 1.348 on 6 and 987 DF, p-value: 0.2327
The aim of linear regression is to model a continuous variable Y as a mathematical function of one or more X variable(s), so that we can use this regression model to predict the Y when only the X is known. Mathematically, $Y = a0 + X1a1+X2a2+X3a3 $ a1 shows the strength of amount of change unit change in X1 will cause to Y when X2 and X3 are constant (cetris paribis). a0 is the intercept representing the value of Y when all the predictors are 0.
However, this is not a good model. The R-sq is low, the features used here - relationship status, age, term, starting salary in job are not statistically significant given that the p-values are not below 0.05.
Model Diagnostics
library(car)
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.6.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Assessing Outliers
outlierTest(model) # Bonferonni p-value for most extreme obs
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 565 -1.840006 0.066068 NA
# Influential Observations
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(students)-length(model$coefficients)-2))
plot(model, which=4, cook.levels=cutoff)
# Influence Plot
influencePlot(model, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" )
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 304 -0.7328934 0.012557634 0.0009763001
## 388 1.8102974 0.010316498 0.0048689684
## 466 1.6087107 0.011791427 0.0044043055
## 490 0.2498860 0.012030896 0.0001087310
## 565 -1.8400064 0.008760110 0.0042640564
## 933 -1.8376686 0.004824058 0.0023329442
# Normality of Residuals
# qq plot for studentized resid
qqPlot(model, main="QQ Plot")
## [1] 565 933
# distribution of studentized residuals
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
sresid <- studres(model)
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40)
yfit<-dnorm(xfit)
lines(xfit, yfit)
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.03375909, Df = 1, p = 0.85422
# plot studentized residuals vs. fitted values
spreadLevelPlot(model)
##
## Suggested power transformation: 1.051388
# Evaluate (Multi)Collinearity
vif(model) # variance inflation factors
## relationship age job term sex salary
## 1.001144 1.005079 1.007709 1.003950 1.006449 1.000567
sqrt(vif(model)) > 2 # problem?
## relationship age job term sex salary
## FALSE FALSE FALSE FALSE FALSE FALSE
# Evaluate Nonlinearity
# component + residual plot
crPlots(model)
# Ceres plots
ceresPlots(model)
## Warning in ceresPlots(model): Factors skipped in drawing CERES plots.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 49945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 33859
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.7266e-015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 9.5133e+008
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 49945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 33859
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.7266e-015
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 9.5133e+008
# Test for Autocorrelated Errors
durbinWatsonTest(model)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.0003930608 2.000273 0.988
## Alternative hypothesis: rho != 0
# model2 = lm(gpa_2010 ~ job, data = students)
gpa_model <- NULL
for (i in seq(2010,2020)) {
mod = lm(students[,eval(paste0("gpa_",i,sep=''))] ~ students[,"job"])
t <- c("call" = as.character(paste0("gpa_",i," ~ job",sep='')),
"intercept" = summary(mod)$coefficients[1], # "interc_pvalue" = summary(mod)$coefficients[2],
"coef" = summary(mod)$coefficients[2], "coef_pvalue" = summary(mod)$coefficients[8]
)
gpa_model <- rbind(gpa_model,t)
}
Here I have run simulations of linear regression model using job as the only predictor. For most of the records show this variable to be not significant. However for years 2013 and 2020, having a student job has proven to be a statistically significant and . It has slightly negative impact on the avg grade point in 2013 while positively affecting the avg grade point in 2020.
Forecast the grade point average for next two year (i.e. 2021 and 2022). You can ignore model assumptions (anything goes). Extra points: Visualize different forecast scenarios.
For each year between 2010 and 2020, estimate the predicted probability of having a grade point average above 2 for students with a job vs. students without a job (controlling for sex and age). (hint: Looking for a loop).
I have created a simple shiny app hosted here: https://priyankaigit.shinyapps.io/GMDAC/
There is scope of many enhancements with respect to allowing the user with choice of visuals and aggregations but this would be my starting work in that direction for the limitation on time I could spend.
This Shiny app allows you to view the sample of your data, allowing to switch between which data you would like to look at. Based on the dataset that is chosen, the list of columns populated in the next input selection changes and gives you a visual depending on whether the field/column is numeric or categorical - presenting a histogram or bar chart (frequency plot) respectively.
# 1. Create a map of the countries of birth of students. The map should provide information on how many students at the University of Berlin were born in each country.
# 2. Include in the map the location of the University of Berlin.
# install.packages("leaflet")
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.6.3
library(maps)
world_map <- map_data("world") # use of maps library to get lat-long information
# List the Countries of birth
cob_countries <- unique(students$cob)
cob_countries <- students %>% na.omit() %>% group_by(cob) %>% summarise(stuCnt = n())
uob_coordinates <- list("region" = 'University of Berlin', "lat" = 52.5125, "long" = 13.3269, "stuCnt" = 994)
# Retrieve their map data
cob_countries_maps <- map_data("world", region = cob_countries$cob)
# Compute the centroid as the mean longitude and lattitude
# Used as label coordinate for country's names
cob_countries_maps_data <- cob_countries_maps %>%
group_by(region) %>%
summarise(long = mean(long), lat = mean(lat))
cob_countries_maps_data <- merge(cob_countries_maps_data,cob_countries, by.x ="region", by.y ="cob")
cob_countries_maps_data <- rbind(cob_countries_maps_data,uob_coordinates)
cob_countries_maps_data$dest <- as.factor(ifelse(cob_countries_maps_data$region == "University of Berlin",1,0))
colorUniv <- colorFactor(topo.colors(2),cob_countries_maps_data$dest)
pal <- colorFactor(
palette = c( 'red', 'blue'),
domain = cob_countries_maps_data$dest
)
cob_countries_maps_data %>% leaflet() %>%
# addTiles %>%
addProviderTiles("CartoDB.Positron") %>%
addCircles(lng = ~long, lat = ~lat, label = ~region, color = ~pal(dest),weight = 7 ) # weight = ~stuCnt/20