GMDAC

Tasks

Data Loading & Initial Exploration

# Read the data
students <- read.csv("data_students.csv")
faculty <- read.csv("data_faculty.csv")

library(visdat)

## Warning: package 'visdat' was built under R version 3.6.3

vis_dat(students)

vis_miss(students)

summary(students)

##       age             cob             course                 faculty   
##  Min.   :20.00   Italy  :266   EU        :249   Business         :249  
##  1st Qu.:24.00   Spain  :265   leadership: 83   Economics        :247  
##  Median :28.00   Germany:152   marketing : 83   Political Science:249  
##  Mean   :27.75   Austria:117   micro     : 83   Sociology        :249  
##  3rd Qu.:32.00   France : 73   discourse : 50   NA's             :  6  
##  Max.   :35.00   (Other):121   (Other)   :446                          
##  NA's   :6       NA's   :  6   NA's      :  6                          
##     gpa_2010        gpa_2011        gpa_2012        gpa_2013    
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.100  
##  1st Qu.:1.300   1st Qu.:1.600   1st Qu.:1.800   1st Qu.:2.000  
##  Median :1.900   Median :2.200   Median :2.400   Median :2.700  
##  Mean   :1.942   Mean   :2.247   Mean   :2.454   Mean   :2.677  
##  3rd Qu.:2.600   3rd Qu.:2.900   3rd Qu.:3.100   3rd Qu.:3.300  
##  Max.   :5.600   Max.   :5.400   Max.   :5.400   Max.   :5.800  
##  NA's   :6       NA's   :6       NA's   :6       NA's   :6      
##     gpa_2014        gpa_2015        gpa_2016       gpa_2017        gpa_2018    
##  Min.   :0.000   Min.   :0.200   Min.   :0.00   Min.   :0.000   Min.   :0.000  
##  1st Qu.:2.200   1st Qu.:2.100   1st Qu.:1.40   1st Qu.:1.000   1st Qu.:2.500  
##  Median :2.800   Median :2.800   Median :2.00   Median :1.600   Median :3.200  
##  Mean   :2.808   Mean   :2.801   Mean   :2.06   Mean   :1.632   Mean   :3.177  
##  3rd Qu.:3.400   3rd Qu.:3.500   3rd Qu.:2.70   3rd Qu.:2.200   3rd Qu.:3.800  
##  Max.   :5.800   Max.   :5.900   Max.   :5.20   Max.   :4.800   Max.   :5.900  
##  NA's   :6       NA's   :6       NA's   :6      NA's   :6       NA's   :6      
##     gpa_2019        gpa_2020       job         lifesat            like      
##  Min.   :0.100   Min.   :1.000   no  :497   Min.   :  0.00   Min.   :0.000  
##  1st Qu.:2.700   1st Qu.:3.300   yes :497   1st Qu.: 25.00   1st Qu.:1.000  
##  Median :3.400   Median :4.000   NA's:  6   Median : 51.50   Median :3.000  
##  Mean   :3.361   Mean   :3.928              Mean   : 51.09   Mean   :2.817  
##  3rd Qu.:4.000   3rd Qu.:4.500              3rd Qu.: 77.00   3rd Qu.:4.000  
##  Max.   :5.900   Max.   :6.000              Max.   :100.00   Max.   :5.000  
##  NA's   :6       NA's   :6                  NA's   :6        NA's   :6      
##             relationship     sex           term         university 
##  In a relationship:329   Female:448   Min.   : 0.000   Berlin:994  
##  Single           :665   Male  :546   1st Qu.: 3.000   NA's  :  6  
##  NA's             :  6   NA's  :  6   Median : 7.000               
##                                       Mean   : 7.177               
##                                       3rd Qu.:11.000               
##                                       Max.   :14.000               
##                                       NA's   :6

library(skimr)

## Warning: package 'skimr' was built under R version 3.6.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

students %>% 
  skim()

Data summary
Name	Piped data
Number of rows	1000
Number of columns	22
_______________________
Column type frequency:
factor	7
numeric	15
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
cob	6	0.99	FALSE	7	Ita: 266, Spa: 265, Ger: 152, Aus: 117
course	6	0.99	FALSE	16	EU: 249, lea: 83, mar: 83, mic: 83
faculty	6	0.99	FALSE	4	Bus: 249, Pol: 249, Soc: 249, Eco: 247
job	6	0.99	FALSE	2	no: 497, yes: 497
relationship	6	0.99	FALSE	2	Sin: 665, In : 329
sex	6	0.99	FALSE	2	Mal: 546, Fem: 448
university	6	0.99	FALSE	1	Ber: 994

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	6	0.99	27.75	4.65	20.0	24.0	28.0	32.0	35.0	▇▆▆▆▇
gpa_2010	6	0.99	1.94	0.96	0.0	1.3	1.9	2.6	5.6	▅▇▆▂▁
gpa_2011	6	0.99	2.25	0.97	0.0	1.6	2.2	2.9	5.4	▂▇▇▃▁
gpa_2012	6	0.99	2.45	0.98	0.0	1.8	2.4	3.1	5.4	▂▆▇▃▁
gpa_2013	6	0.99	2.68	0.98	0.1	2.0	2.7	3.3	5.8	▁▅▇▂▁
gpa_2014	6	0.99	2.81	0.97	0.0	2.2	2.8	3.4	5.8	▁▅▇▅▁
gpa_2015	6	0.99	2.80	0.99	0.2	2.1	2.8	3.5	5.9	▂▆▇▃▁
gpa_2016	6	0.99	2.06	0.93	0.0	1.4	2.0	2.7	5.2	▃▇▇▂▁
gpa_2017	6	0.99	1.63	0.88	0.0	1.0	1.6	2.2	4.8	▃▇▅▂▁
gpa_2018	6	0.99	3.18	1.00	0.0	2.5	3.2	3.8	5.9	▁▃▇▆▁
gpa_2019	6	0.99	3.36	0.96	0.1	2.7	3.4	4.0	5.9	▁▃▇▇▂
gpa_2020	6	0.99	3.93	0.91	1.0	3.3	4.0	4.5	6.0	▁▃▇▇▂
lifesat	6	0.99	51.09	29.59	0.0	25.0	51.5	77.0	100.0	▇▇▇▇▇
like	6	0.99	2.82	1.67	0.0	1.0	3.0	4.0	5.0	▇▅▆▅▆
term	6	0.99	7.18	4.42	0.0	3.0	7.0	11.0	14.0	▇▇▇▇▇

Data Manipulation

# 1.    Create a summary table: 
  # a) the percentage of non-German students, 
  # b) average life satisfaction, 
  # c) percentage of students in a relationship, 
  # d) sex ratio, e) percentage of students over 30, 
  # f) average gpa in 2010 and 
  # g) average number of terms per faculty. 


agePct <- nrow(students[students$age>30,])/nrow(students)

nonGermanPct<- nrow(students[students$cob!="Germany",])/
               nrow(students)

inARelationshipPct <- prop.table(table(students$relationship))[1]
  
summaryTable <- students %>% summarise(
                  avgLifeSat = mean(lifesat,na.rm=TRUE),
                  avgGpa2010 = mean(gpa_2010,na.rm=TRUE),
                  
)

summaryTable$agePct <- agePct
summaryTable$nonGermanPct <- nonGermanPct
summaryTable$inARelationshipPct <- inARelationshipPct

# 2. Create a visual showing differences in average life satisfaction by faculty and relationship status. 
library(ggplot2)
students %>% 
          na.omit() %>% 
          ggplot(aes(x = lifesat)) + 
            geom_bar() + 
            facet_grid(factor(relationship) ~ factor(faculty))+ 
            theme_bw()

# 3.    Combine faculty data with student data into one dataset. Show differences in the average cost of the career by faculty and job status.  

stu_fac <- merge(students, faculty, by="faculty")

library(tidyr)
stu_fac %>% 
  group_by(faculty,job) %>% 
  summarise(meanCostOfCareer = mean(costs)) %>%    
  pivot_wider(names_from = faculty, values_from =      
                meanCostOfCareer)

## # A tibble: 2 x 5
##   job   Business Economics `Political Science` Sociology
##   <fct>    <dbl>     <dbl>               <dbl>     <dbl>
## 1 no       37245     32396               25216     27051
## 2 yes      37245     32396               25216     27051

Given that the cost of the program/career is given at faculty/department level, it does not change with the job status of the student atleast not until we adjust for the student income related to the campus job.

# 4.    Create a visual showing the relationship between life satisfaction and age. 
library(ggplot2)
students %>% 
          na.omit() %>% 
          ggplot(aes( x = age, y = lifesat, group = age)) + 
            geom_boxplot() + 
            theme_classic()

Modelling

Check whether the relationship status has an effect on life satisfaction regardless of the number of terms, age, sex and the expected entry salary after university. If possible, visualize effects.

Some of the variable relationships are shown here:

library(ggplot2)

par(mfrow=c(2,2))
students %>% na.omit() %>% 
              ggplot(aes(y=lifesat, group = relationship)) + 
              geom_boxplot()

students %>% na.omit() %>% 
              ggplot(aes(y=lifesat, group = age)) + 
              geom_boxplot()

students %>% na.omit() %>% 
              ggplot(aes(y=lifesat, group = sex)) + 
              geom_boxplot()

students %>% na.omit() %>% 
              ggplot(aes(y=lifesat, x = term, group = term)) + 
              geom_boxplot()

Model Building

lmdata <- merge(students, faculty, by = "faculty", all.x = TRUE)

# Build linear regression model to test the relationship between the response(lifesat - 0 - 100) and predictor(relationship status - Single. Not single)
model <- lm(lifesat ~ relationship + age + job + term + sex + salary , data=lmdata)  
summary(model)

## 
## Call:
## lm(formula = lifesat ~ relationship + age + job + term + sex + 
##     salary, data = lmdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.125 -24.748   0.253  26.666  53.174 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.568e+01  7.403e+00   6.171 9.92e-10 ***
## relationshipSingle  2.065e+00  1.994e+00   1.036   0.3005    
## age                 1.210e-01  2.022e-01   0.599   0.5496    
## jobyes              8.886e-01  1.882e+00   0.472   0.6370    
## term                1.019e-01  2.128e-01   0.479   0.6321    
## sexMale            -4.707e+00  1.890e+00  -2.490   0.0129 *  
## salary              2.828e-05  5.434e-05   0.520   0.6028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.56 on 987 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.008131,   Adjusted R-squared:  0.002101 
## F-statistic: 1.348 on 6 and 987 DF,  p-value: 0.2327

The aim of linear regression is to model a continuous variable Y as a mathematical function of one or more X variable(s), so that we can use this regression model to predict the Y when only the X is known. Mathematically, $Y = a0 + X1a1+X2a2+X3a3 $ a1 shows the strength of amount of change unit change in X1 will cause to Y when X2 and X3 are constant (cetris paribis). a0 is the intercept representing the value of Y when all the predictors are 0.

However, this is not a good model. The R-sq is low, the features used here - relationship status, age, term, starting salary in job are not statistically significant given that the p-values are not below 0.05.

Model Diagnostics

library(car)

## Warning: package 'car' was built under R version 3.6.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.6.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

# Assessing Outliers
outlierTest(model) # Bonferonni p-value for most extreme obs

## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferroni p
## 565 -1.840006           0.066068           NA

# Influential Observations

# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(students)-length(model$coefficients)-2))
plot(model, which=4, cook.levels=cutoff)

# Influence Plot
influencePlot(model, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" )

## Warning in plot.window(...): "id.method" is not a graphical parameter

## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in box(...): "id.method" is not a graphical parameter

## Warning in title(...): "id.method" is not a graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##        StudRes         Hat        CookD
## 304 -0.7328934 0.012557634 0.0009763001
## 388  1.8102974 0.010316498 0.0048689684
## 466  1.6087107 0.011791427 0.0044043055
## 490  0.2498860 0.012030896 0.0001087310
## 565 -1.8400064 0.008760110 0.0042640564
## 933 -1.8376686 0.004824058 0.0023329442

# Normality of Residuals
# qq plot for studentized resid
qqPlot(model, main="QQ Plot")

## [1] 565 933

# distribution of studentized residuals
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

sresid <- studres(model)
hist(sresid, freq=FALSE, main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40)
yfit<-dnorm(xfit)
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(model)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.03375909, Df = 1, p = 0.85422

# plot studentized residuals vs. fitted values
spreadLevelPlot(model)

## 
## Suggested power transformation:  1.051388

# Evaluate (Multi)Collinearity
vif(model) # variance inflation factors

## relationship          age          job         term          sex       salary 
##     1.001144     1.005079     1.007709     1.003950     1.006449     1.000567

sqrt(vif(model)) > 2 # problem?

## relationship          age          job         term          sex       salary 
##        FALSE        FALSE        FALSE        FALSE        FALSE        FALSE

# Evaluate Nonlinearity
# component + residual plot
crPlots(model)

# Ceres plots
ceresPlots(model)

## Warning in ceresPlots(model): Factors skipped in drawing CERES plots.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 49945

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 33859

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.7266e-015

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 9.5133e+008

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 49945

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 33859

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.7266e-015

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 9.5133e+008

# Test for Autocorrelated Errors
durbinWatsonTest(model)

##  lag Autocorrelation D-W Statistic p-value
##    1   -0.0003930608      2.000273   0.988
##  Alternative hypothesis: rho != 0

Test whether having a job has a negative effect on the average grade point average over the last 10 years. If possible, visualize effects.

# model2 = lm(gpa_2010 ~ job, data = students)

gpa_model <- NULL

for (i in seq(2010,2020)) {
  mod = lm(students[,eval(paste0("gpa_",i,sep=''))] ~ students[,"job"])
  
t <- c("call" = as.character(paste0("gpa_",i," ~ job",sep='')), 
                   "intercept" = summary(mod)$coefficients[1], # "interc_pvalue" = summary(mod)$coefficients[2], 
                   "coef" = summary(mod)$coefficients[2], "coef_pvalue" = summary(mod)$coefficients[8]
                   )
  gpa_model <- rbind(gpa_model,t)
  
}

Here I have run simulations of linear regression model using job as the only predictor. For most of the records show this variable to be not significant. However for years 2013 and 2020, having a student job has proven to be a statistically significant and . It has slightly negative impact on the avg grade point in 2013 while positively affecting the avg grade point in 2020.

Forecast the grade point average for next two year (i.e. 2021 and 2022). You can ignore model assumptions (anything goes). Extra points: Visualize different forecast scenarios.
For each year between 2010 and 2020, estimate the predicted probability of having a grade point average above 2 for students with a job vs. students without a job (controlling for sex and age). (hint: Looking for a loop).

Viz, UX & Presentation

Create an application or visual where users can select the data they want to see and the breakdowns they are interested in (be creative, anything goes).

I have created a simple shiny app hosted here: https://priyankaigit.shinyapps.io/GMDAC/

There is scope of many enhancements with respect to allowing the user with choice of visuals and aggregations but this would be my starting work in that direction for the limitation on time I could spend.

Record a max. 3 min screencast or audio of you walking the user through the app and explaining how it works and what the results are.
Recording Not done
Quick Write Up on this:

This Shiny app allows you to view the sample of your data, allowing to switch between which data you would like to look at. Based on the dataset that is chosen, the list of columns populated in the next input selection changes and gives you a visual depending on whether the field/column is numeric or categorical - presenting a histogram or bar chart (frequency plot) respectively.

Maps

# 1.    Create a map of the countries of birth of students. The map should provide information on how many students at the University of Berlin were born in each country.
# 2.    Include in the map the location of the University of Berlin. 

# install.packages("leaflet")
library(leaflet)

## Warning: package 'leaflet' was built under R version 3.6.3

library(maps)

world_map <- map_data("world")  # use of maps library to get lat-long information

# List the Countries of birth
cob_countries <- unique(students$cob)
cob_countries <- students %>% na.omit()  %>% group_by(cob) %>% summarise(stuCnt = n())

uob_coordinates <- list("region" = 'University of Berlin', "lat" = 52.5125, "long" = 13.3269, "stuCnt" = 994)

# Retrieve their map data
cob_countries_maps <- map_data("world", region = cob_countries$cob)

# Compute the centroid as the mean longitude and lattitude
# Used as label coordinate for country's names
cob_countries_maps_data <- cob_countries_maps %>%
  group_by(region) %>%
  summarise(long = mean(long), lat = mean(lat))

cob_countries_maps_data <- merge(cob_countries_maps_data,cob_countries, by.x ="region", by.y ="cob")
cob_countries_maps_data <- rbind(cob_countries_maps_data,uob_coordinates)

cob_countries_maps_data$dest <- as.factor(ifelse(cob_countries_maps_data$region == "University of Berlin",1,0))
colorUniv <- colorFactor(topo.colors(2),cob_countries_maps_data$dest)
  
pal <- colorFactor(
  palette = c( 'red',  'blue'),
  domain = cob_countries_maps_data$dest
)

cob_countries_maps_data %>% leaflet() %>%
#  addTiles  %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addCircles(lng = ~long, lat = ~lat, label = ~region, color = ~pal(dest),weight = 7 )   # weight = ~stuCnt/20