STAT183 Group Project Final Analysis

### IMPORTANT:
### Load the FINAL Excel sheet provided by Dr. Xu -- the version that
### includes the two new columns.
data <- read_excel(
  path = "University Honors Students (Fall 2017 - Fall 2021)(Final) with two new columns.xlsx"
)

Cleaning and Processing Data

# Replace missing values with an explicit category so those students are kept
# in the tables and tests: "N" for the two Y/N flags, "No" for capstone
# completion, and "Others" for gender and admit type.
data <- data %>%
  mutate(
    across(c(`First Generation`, `Low Income`), ~ replace_na(.x, "N")),
    across(`Capstone Project Completion`, ~ replace_na(.x, "No")),
    across(c(`Gender`, `Admit Type`), ~ replace_na(.x, "Others"))
  )

# Recode capstone completion as a numeric 0/1 indicator:
# 1 when the value is "Yes", 0 for any other non-missing value.
data$`Capstone Project Completion` <-
  as.numeric(data$`Capstone Project Completion` == "Yes")

# Collapse the Admitted Honors Cohort labels: both "New Second Year" variants
# become "Second Year", "New Third Year" becomes "Third Year", and the
# Fourth, Fifth and Sixth Year cohorts are pooled into "Others".
data$`Admitted Honors Cohort` <- local({
  cohort <- data$`Admitted Honors Cohort`
  cohort[cohort %in% c("New Second Year (Waitlist)", "New Second Year")] <- "Second Year"
  cohort[cohort %in% "New Third Year"] <- "Third Year"
  cohort[cohort %in% c("Fourth Year", "Fifth Year", "Sixth Year")] <- "Others"
  cohort
})


# Pool the PP and ED colleges into "Others" because of their small sample sizes.
data$College[data$College %in% c("PP", "ED")] <- "Others"

# Majors are not analysed; students are compared by college only.
data[["Major"]] <- NULL

# Pool Native Hawaiian or Other Pacific Islander together with Unknown into a
# single "OTHERS" ethnicity category because of their small sample sizes.
data$`IPEDS Ethnicity`[
  data$`IPEDS Ethnicity` %in% c("NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER", "UNKNOWN")
] <- "OTHERS"

# Consolidate admit types by their FIRST letter: codes beginning with "T"
# become "T" (transfer), codes beginning with "F" become "F" (freshman), and
# every other non-missing code (including the "Others" placeholder used for
# missing values) becomes "Others".
# startsWith() tests only the start of the string, so a "T" or "F" appearing
# later in a code can no longer trigger a partial rewrite, and admit types
# with an unexpected first letter are caught by the final "Others" branch.
data$`Admit Type` <- ifelse(
  startsWith(data$`Admit Type`, "T"), "T",
  ifelse(startsWith(data$`Admit Type`, "F"), "F", "Others")
)

# Print the cleaned data set for inspection.
data

## # A tibble: 1,015 x 19
##    Student Gender `IPEDS Ethnicity`  `Low Income` `First Generati~` `Admit Type`
##      <dbl> <chr>  <chr>              <chr>        <chr>             <chr>       
##  1       1 F      WHITE              N            Y                 F           
##  2       2 F      WHITE              N            N                 F           
##  3       3 M      WHITE              N            N                 F           
##  4       4 F      ASIAN              N            N                 F           
##  5       5 M      MULTI-RACIAL       N            N                 F           
##  6       6 F      WHITE              Y            N                 F           
##  7       7 M      HISPANIC OR LATINO N            Y                 F           
##  8       8 M      WHITE              N            N                 T           
##  9       9 M      WHITE              Y            N                 F           
## 10      10 M      WHITE              N            Y                 T           
## # ... with 1,005 more rows, and 13 more variables:
## #   `Evaluated High School GPA` <dbl>, `Highlander Scholarship` <chr>,
## #   `Chancellor's Scholarship` <chr>, `Regents' Scholarship` <chr>,
## #   `Previous School` <chr>, College <chr>, `UCR Entry Action` <chr>,
## #   `Admitted Honors Cohort` <chr>, `Permanent City` <chr>,
## #   `Permanent State` <chr>, `Capstone Project Completion` <dbl>,
## #   Permanent.Region <chr>, Permanent.County <chr>

Descriptive Analysis: Tables

# Ethnicity (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended.
print("Number of Students Completed Capstone by Ethnicity:")

## [1] "Number of Students Completed Capstone by Ethnicity:"

addmargins(
  table(data[["IPEDS Ethnicity"]], data[["Capstone Project Completion"]])
)

##                            
##                                0    1  Sum
##   ASIAN                      164  142  306
##   BLACK OR AFRICAN AMERICAN   13   13   26
##   HISPANIC OR LATINO         210  150  360
##   MULTI-RACIAL                35   38   73
##   NON-RESIDENT ALIEN          16    7   23
##   OTHERS                       7    6   13
##   WHITE                       99  115  214
##   Sum                        544  471 1015

cat("\n")

# Gender (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended.
print("Number of Students Completed Capstone by Gender:")

## [1] "Number of Students Completed Capstone by Gender:"

addmargins(
  table(data[["Gender"]], data[["Capstone Project Completion"]])
)

##         
##             0    1  Sum
##   F       340  281  621
##   M       198  183  381
##   N         4    4    8
##   Others    2    3    5
##   Sum     544  471 1015

cat("\n")

# Low-income flag (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended.
print("Number of Students Completed Capstone by Low Income:")

## [1] "Number of Students Completed Capstone by Low Income:"

addmargins(
  table(data[["Low Income"]], data[["Capstone Project Completion"]])
)

##      
##          0    1  Sum
##   N    381  345  726
##   Y    163  126  289
##   Sum  544  471 1015

cat("\n")

# First-generation flag (rows) by capstone completion (columns: 0 = no,
# 1 = yes), with row and column totals appended.
print("Number of Students Completed Capstone by First Generation:")

## [1] "Number of Students Completed Capstone by First Generation:"

addmargins(
  table(data[["First Generation"]], data[["Capstone Project Completion"]])
)

##      
##          0    1  Sum
##   N    260  281  541
##   Y    284  190  474
##   Sum  544  471 1015

cat("\n")

# Consolidated admit type (rows) by capstone completion (columns: 0 = no,
# 1 = yes), with row and column totals appended.
print("Number of Students Completed Capstone by Admit Type:")

## [1] "Number of Students Completed Capstone by Admit Type:"

addmargins(
  table(data[["Admit Type"]], data[["Capstone Project Completion"]])
)

##         
##             0    1  Sum
##   F       419  374  793
##   Others    8    8   16
##   T       117   89  206
##   Sum     544  471 1015

cat("\n")

# College (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended.
print("Number of Students Completed Capstone by College:")

## [1] "Number of Students Completed Capstone by College:"

addmargins(
  table(data[["College"]], data[["Capstone Project Completion"]])
)

##         
##             0    1  Sum
##   BU       30   23   53
##   EN       72   61  133
##   HS      211  194  405
##   NA      227  185  412
##   Others    4    8   12
##   Sum     544  471 1015

cat("\n")

# UCR entry action (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended.
print("Number of Students Completed Capstone by UCR Entry Action:")

## [1] "Number of Students Completed Capstone by UCR Entry Action:"

addmargins(
  table(data[["UCR Entry Action"]], data[["Capstone Project Completion"]])
)

##                      
##                          0    1  Sum
##   First Time Freshman  424  379  803
##   First Time Transfer  120   92  212
##   Sum                  544  471 1015

cat("\n")

# Consolidated admitted honors cohort (rows) by capstone completion
# (columns: 0 = no, 1 = yes), with row and column totals appended.
print("Number of Students Completed Capstone by Admitted Honors Cohort:")

## [1] "Number of Students Completed Capstone by Admitted Honors Cohort:"

addmargins(
  table(data[["Admitted Honors Cohort"]], data[["Capstone Project Completion"]])
)

##                    
##                        0    1  Sum
##   First Year         185   74  259
##   Incoming Transfer  103   70  173
##   Others              43  108  151
##   Second Year        136  112  248
##   Third Year          77  107  184
##   Sum                544  471 1015

cat("\n")

# Permanent region (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended. table() omits rows whose region is
# missing, so the grand total can be smaller than the number of students.
print("Number of Students Completed Capstone by Permanent Region:")

## [1] "Number of Students Completed Capstone by Permanent Region:"

addmargins(
  table(data[["Permanent.Region"]], data[["Capstone Project Completion"]])
)

##         
##             0    1  Sum
##   1         1    7    8
##   10       31   21   52
##   2         1    0    1
##   3        45   38   83
##   4         3    4    7
##   5        10   10   20
##   6        12    9   21
##   7       248  224  472
##   8       129  100  229
##   9        44   48   92
##   other     5    1    6
##   Other     6    4   10
##   others    5    3    8
##   Sum     540  469 1009

cat("\n")

# Permanent county (rows) by capstone completion (columns: 0 = no, 1 = yes),
# with row and column totals appended. table() omits rows whose county is
# missing, so the grand total can be smaller than the number of students.
print("Number of Students Completed Capstone by Permanent County:")

## [1] "Number of Students Completed Capstone by Permanent County:"

addmargins(
  table(data[["Permanent.County"]], data[["Capstone Project Completion"]])
)

##                         
##                             0    1  Sum
##   Alameda                  12    9   21
##   Alameda County            2    1    3
##   Butte                     0    1    1
##   Contra Costa              4    5    9
##   Fresno                    1    1    2
##   Imperial                  1    1    2
##   Imperial County           0    1    1
##   Kern                      7    3   10
##   Kern County               0    1    1
##   Kings                     1    0    1
##   Los Angekes               1    0    1
##   Los Angeles             106   87  193
##   Los Angeles County       22   13   35
##   Madera                    0    1    1
##   Marin                     0    2    2
##   Merced                    1    1    2
##   Monterey County           1    0    1
##   Napa                      1    0    1
##   Orange                   41   40   81
##   Orange County             2    8   10
##   other                     5    1    6
##   Other                     5    4    9
##   others                    5    4    9
##   Others                    1    0    1
##   Placer                    1    1    2
##   Riverside               128   97  225
##   RIverside                 1    2    3
##   Riverside County         33   15   48
##   Sacramento                0    3    3
##   Sacramento County         0    1    1
##   San Bernadino             9   13   22
##   San Bernadino County     14   31   45
##   San Bernardino           63   66  129
##   San Diego                30   11   41
##   San Diego County          1    8    9
##   San Francisco             8    1    9
##   San Francisco County      2    0    2
##   San Joaquin               1    1    2
##   San Joaquin County        1    0    1
##   San Luis Obispo           1    0    1
##   San Luis Obispo County    0    1    1
##   San Mateo                 0    2    2
##   San Mateo County          1    0    1
##   Santa Barbara             1    1    2
##   Santa Barbra County       0    1    1
##   Santa Clara              11   17   28
##   Santa Clara County        4    1    5
##   Santa Cruz                1    1    2
##   Shasta                    0    1    1
##   Solano                    1    0    1
##   Stanislaus                0    1    1
##   Tulare                    2    3    5
##   Tulare County             1    0    1
##   Ventura                   6    6   12
##   Sum                     540  469 1009

cat("\n")

Descriptive Analysis: Visualization

### By Admitted Honors Cohort:
# Stacked bar chart of student counts per cohort, split by capstone
# completion. Cohorts run left to right by decreasing mean segment count
# (reorder() averages n within each cohort); each segment shows its count.
# The x-axis labels are rotated 90 degrees.
data %>%
  dplyr::count(`Admitted Honors Cohort`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = reorder(`Admitted Honors Cohort`, -n),
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by Admitted Honors Cohort",
    x = "Admitted Honors Cohort",
    y = "Frequency of Students"
  )

### By UCR Entry Action:
# Stacked bar chart of student counts per UCR entry action, split by capstone
# completion. Categories run left to right by decreasing mean segment count;
# each segment shows its count.
data %>%
  dplyr::count(`UCR Entry Action`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = reorder(`UCR Entry Action`, -n),
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by UCR Entry Action",
    x = "UCR Entry Action",
    y = "Frequency of Students"
  )

### By College:
# Stacked bar chart of student counts per college, split by capstone
# completion. Colleges run left to right by decreasing mean segment count;
# each segment shows its count.
data %>%
  dplyr::count(`College`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = reorder(`College`, -n),
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by College",
    x = "College",
    y = "Frequency of Students"
  )

### By Income
# Stacked bar chart of student counts per low-income flag, split by capstone
# completion; each segment shows its count.
data %>%
  dplyr::count(`Low Income`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = `Low Income`,
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by Income",
    y = "Frequency of Students"
  )

### By First Generation status
# Stacked bar chart of student counts per first-generation flag, split by
# capstone completion; each segment shows its count.
data %>%
  dplyr::count(`First Generation`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = `First Generation`,
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by First Generation status",
    y = "Frequency of Students"
  )

### By Ethnicity
# Horizontal stacked bar chart of student counts per ethnicity, split by
# capstone completion. Ethnicities are ordered by increasing mean segment
# count along the y axis. No count labels are drawn on this chart (there is
# no geom_text layer), although the label aesthetic is still mapped.
data %>%
  dplyr::count(`IPEDS Ethnicity`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = n,
    y = reorder(`IPEDS Ethnicity`, n),
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by Ethnicity",
    x = "Frequency of Students",
    y = "Ethnicity"
  )

### By Gender
# Stacked bar chart of student counts per gender category, split by capstone
# completion; each segment shows its count.
data %>%
  dplyr::count(`Gender`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = `Gender`,
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by Gender",
    y = "Frequency of Students"
  )

### By Admit Type
# Stacked bar chart of student counts per consolidated admit type, split by
# capstone completion. Admit types run left to right by increasing mean
# segment count; each segment shows its count.
# The x label is set explicitly because the x aesthetic is a reorder() call,
# which would otherwise be printed verbatim as the axis title.
data %>%
  dplyr::count(`Admit Type`, `Capstone Project Completion`) %>%
  ggplot(aes(
    x = reorder(`Admit Type`, n),
    y = n,
    fill = factor(`Capstone Project Completion`),
    label = n
  )) +
  geom_col() +
  geom_text(size = 3.5, position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(
    labels = c("No", "Yes"),
    values = c("0" = "#E41A1C", "1" = "#377EB8"),
    name = "Capstone Completed"
  ) +
  labs(
    title = "Barplot of Capstone Completion by Admit Type",
    x = "Admit Type",
    y = "Frequency of Students"
  )

### By Evaluated HS GPA
## sum(is.na(data$`Evaluated High School GPA`))
## number of NA = 267
# Base histogram of evaluated high school GPA, kept in `gpa_viz` for reuse.
gpa_viz <- ggplot(data = data, mapping = aes(x = `Evaluated High School GPA`)) +
  geom_histogram(fill = "#377EB8", color = "white")

# Draw the histogram with its title and axis labels.
gpa_viz +
  ggtitle("Histogram of Honor Students GPA") +
  xlab("GPA") +
  ylab("Number of Students")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 267 rows containing non-finite values (stat_bin).

Data Analysis:

# Series of chi-square tests of independence, one per categorical predictor.
# Each test asks whether capstone completion is associated with that variable.
# The results are stored as t1..t13 and summarised in a table afterwards.
# scipen = 999 is a session-wide option that makes R print numbers (here the
# p-values) in fixed rather than scientific notation.
options(scipen=999)
t1 <- chisq.test(data$`Capstone Project Completion`, data$`Gender`)

## Warning in chisq.test(data$`Capstone Project Completion`, data$Gender): Chi-
## squared approximation may be incorrect

# NOTE(review): the "approximation may be incorrect" warnings are presumably
# caused by very small cell counts (e.g. the smallest Gender, Region and
# County categories) -- confirm before relying on those p-values.
t2 <- chisq.test(data$`Capstone Project Completion`, data$`IPEDS Ethnicity`)
t3 <- chisq.test(data$`Capstone Project Completion`, data$`Low Income`)
t4 <- chisq.test(data$`Capstone Project Completion`, data$`First Generation`)
t5 <- chisq.test(data$`Capstone Project Completion`, data$`Admit Type`)
t6 <- chisq.test(data$`Capstone Project Completion`, data$`College`)
t7 <- chisq.test(data$`Capstone Project Completion`, data$`UCR Entry Action`)
t8 <- chisq.test(data$`Capstone Project Completion`, data$`Admitted Honors Cohort`)
t9 <- chisq.test(data$`Capstone Project Completion`, data$`Highlander Scholarship`)
t10 <- chisq.test(data$`Capstone Project Completion`, data$`Chancellor's Scholarship`)
t11 <- chisq.test(data$`Capstone Project Completion`, data$`Regents' Scholarship`)
t12 <- chisq.test(data$`Capstone Project Completion`, data$`Permanent.Region`)

## Warning in chisq.test(data$`Capstone Project Completion`,
## data$Permanent.Region): Chi-squared approximation may be incorrect

t13 <- chisq.test(data$`Capstone Project Completion`, data$`Permanent.County`)

## Warning in chisq.test(data$`Capstone Project Completion`,
## data$Permanent.County): Chi-squared approximation may be incorrect

# Stack the tidied results of the 13 chi-square tests (one row per test, in
# the order t1..t13) and keep only the p-value column.
tab <- map_df(list(t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13), tidy)
p1 <- tab[c("p.value")]

# Display names for the tested variables, in the same order as the tests.
variables <- c(
  "Gender", "IPEDS Ethnicity", "Low Income", "First Generation", "Admit Type",
  "College", "UCR Entry Action", "Admitted Honors Cohort",
  "Highlander Scholarship", "Chancellor's Scholarship", "Regents' Scholarship",
  "Permanent Region", "Permanent County"
)
stopifnot(length(variables) == nrow(p1))

# Derive each verdict from its p-value at the 5% significance level, so the
# table cannot drift out of sync with the tests when the data change.
conclusion <- ifelse(p1$p.value < 0.05, "significant", "not significant")

# One row per variable: name, p-value, verdict.
cbind(variables, p1, conclusion)

##                   variables                   p.value      conclusion
## 1                    Gender 0.76460137472442890604896 not significant
## 2           IPEDS Ethnicity 0.07930674656709743586269 not significant
## 3                Low Income 0.28869858577139351218932 not significant
## 4          First Generation 0.00020253892438389735331     significant
## 5                Admit Type 0.57265602771752177613251 not significant
## 6                   College 0.57000217789489582287388 not significant
## 7          UCR Entry Action 0.36289978937173972273200 not significant
## 8    Admitted Honors Cohort 0.00000000000000002192658     significant
## 9    Highlander Scholarship 0.10506833356879220764402 not significant
## 10 Chancellor's Scholarship 0.00003976320327142305526     significant
## 11     Regents' Scholarship 0.01081965404658243729008     significant
## 12         Permanent Region 0.42322303678668604653978 not significant
## 13         Permanent County 0.01403408682722000026810     significant

# Split the students by outcome: `capstone` holds those who completed the
# capstone project (indicator 1), `no.capstone` those who did not (indicator 0).
capstone <- dplyr::filter(data, `Capstone Project Completion` == 1)

no.capstone <- dplyr::filter(data, `Capstone Project Completion` == 0)


# capstone <- data %>%
#   filter(`Capstone Project Completion` == 1) %>%
#   pull(`Evaluated High School GPA`)
# no.capstone <- data %>%
#   filter(`Capstone Project Completion` == 0) %>%
#   pull(`Evaluated High School GPA`)

## F test
## H0: σ1 = σ2
## Ha: σ1 ≠ σ2

## two sample t test
## H0: µ1 = µ2
## Ha: µ1 ≠ µ2

# F test for equal variances of high school GPA between students who completed
# the capstone project and those who did not. Its outcome decides whether the
# two-sample t-test on the mean GPA may assume equal variances.
var.test(
  x = capstone$`Evaluated High School GPA`,
  y = no.capstone$`Evaluated High School GPA`,
  alternative = "two.sided"
) # reject H0 -> so σ1 ≠ σ2

## 
##  F test to compare two variances
## 
## data:  capstone$`Evaluated High School GPA` and no.capstone$`Evaluated High School GPA`
## F = 1.4152, num df = 350, denom df = 396, p-value = 0.0008058
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.155210 1.736054
## sample estimates:
## ratio of variances 
##           1.415242

# Welch two-sample t-test (unequal variances) of mean high school GPA:
# students who completed the capstone project vs. those who did not.
t.test(
  x = capstone$`Evaluated High School GPA`,
  y = no.capstone$`Evaluated High School GPA`,
  alternative = "two.sided",
  var.equal = FALSE
) # fail to reject H0 -> mean GPA does not differ significantly between groups

## 
##  Welch Two Sample t-test
## 
## data:  capstone$`Evaluated High School GPA` and no.capstone$`Evaluated High School GPA`
## t = 0.092699, df = 686.95, p-value = 0.9262
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03700090  0.04066786
## sample estimates:
## mean of x mean of y 
##  3.904302  3.902469

# State the conclusion of the t-test in the rendered report.
print("fail to reject H0 ->  average gpa of students who completed capstone project vs those who didn't are not significantly different")

## [1] "fail to reject H0 ->  average gpa of students who completed capstone project vs those who didn't are not significantly different"

# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(1)

# Convert every character column to a factor (other columns are left as they
# are) so the categorical predictors share one set of levels across the
# training and test sets.
data[] <- lapply(data, function(column) {
  if (is.character(column)) as.factor(column) else column
})

# Randomly flag each row for the training set with probability 0.7; rows that
# are not flagged form the test set.
# NOTE(review): the mask keeps its original name `sample`, which shadows
# base::sample() as a variable (function calls still resolve to the function).
sample <- sample(c(TRUE, FALSE), nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[sample, ]
test <- data[!sample, ]

# Logistic regression of capstone completion on the five predictors that the
# chi-square tests flagged as significant, fitted on the training set only.
# NOTE(review): in the fitted summary many Permanent.County levels have
# estimates near +/-16 with standard errors in the thousands, which suggests
# (quasi-)separation in sparsely populated county levels -- confirm whether
# county should be cleaned or lumped before it is used as a predictor.
model <- glm(`Capstone Project Completion` ~ `Admitted Honors Cohort` + `First Generation`+ `Chancellor's Scholarship` + `Regents' Scholarship` + Permanent.County, family = "binomial", data = train)

# Coefficient table, deviances and AIC of the fitted model.
summary(model)

## 
## Call:
## glm(formula = `Capstone Project Completion` ~ `Admitted Honors Cohort` + 
##     `First Generation` + `Chancellor's Scholarship` + `Regents' Scholarship` + 
##     Permanent.County, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1369  -1.0079  -0.5184   1.0706   2.0339  
## 
## Coefficients:
##                                            Estimate Std. Error z value
## (Intercept)                                 -0.7603     0.5807  -1.309
## `Admitted Honors Cohort`Incoming Transfer    0.7661     0.2791   2.745
## `Admitted Honors Cohort`Others               1.6086     0.3178   5.061
## `Admitted Honors Cohort`Second Year          0.9682     0.2460   3.936
## `Admitted Honors Cohort`Third Year           1.2981     0.2682   4.840
## `First Generation`Y                         -0.1890     0.1735  -1.089
## `Chancellor's Scholarship`Y                  0.5940     0.3292   1.804
## `Regents' Scholarship`Y                      1.6060     0.6910   2.324
## Permanent.CountyAlameda County             -16.8527  1675.7452  -0.010
## Permanent.CountyContra Costa                 0.3028     1.0937   0.277
## Permanent.CountyImperial                   -16.5849  2399.5448  -0.007
## Permanent.CountyKern                        -1.9286     1.2661  -1.523
## Permanent.CountyKings                      -17.1038  2399.5448  -0.007
## Permanent.CountyLos Angeles                 -0.2293     0.5828  -0.393
## Permanent.CountyLos Angeles County          -0.2542     0.6954  -0.366
## Permanent.CountyMarin                       16.1960  1687.5686   0.010
## Permanent.CountyMerced                       0.2058     1.5725   0.131
## Permanent.CountyMonterey County            -16.5849  2399.5448  -0.007
## Permanent.CountyOrange                      -0.2710     0.6233  -0.435
## Permanent.CountyOrange County                1.3219     1.0033   1.318
## Permanent.Countyother                       -0.8279     1.3648  -0.607
## Permanent.CountyOther                        0.3208     1.0179   0.315
## Permanent.Countyothers                      -0.6167     0.9919  -0.622
## Permanent.CountyOthers                     -17.4143  2399.5448  -0.007
## Permanent.CountyPlacer                     -16.7739  2399.5448  -0.007
## Permanent.CountyRiverside                   -0.4331     0.5846  -0.741
## Permanent.CountyRIverside                    0.9341     1.3993   0.668
## Permanent.CountyRiverside County            -0.9838     0.6844  -1.437
## Permanent.CountySacramento                  16.5028  1350.6475   0.012
## Permanent.CountySacramento County           15.7178  2399.5448   0.007
## Permanent.CountySan Bernadino                0.6903     0.8364   0.825
## Permanent.CountySan Bernadino County         1.3300     0.7083   1.878
## Permanent.CountySan Bernardino              -0.1583     0.6041  -0.262
## Permanent.CountySan Diego                   -0.9900     0.6946  -1.425
## Permanent.CountySan Diego County            16.2303   961.4852   0.017
## Permanent.CountySan Francisco               -1.5247     1.2963  -1.176
## Permanent.CountySan Francisco County       -17.1038  2399.5448  -0.007
## Permanent.CountySan Joaquin                -15.6167  2399.5448  -0.007
## Permanent.CountySan Joaquin County         -15.8057  2399.5448  -0.007
## Permanent.CountySan Luis Obispo County      16.5604  2399.5448   0.007
## Permanent.CountySanta Barbara                0.2058     1.6138   0.128
## Permanent.CountySanta Barbra County         16.0283  2399.5448   0.007
## Permanent.CountySanta Clara                  0.5781     0.7275   0.795
## Permanent.CountySanta Clara County         -16.4128  1348.2026  -0.012
## Permanent.CountySanta Cruz                  16.3582  2399.5448   0.007
## Permanent.CountyStanislaus                  15.3129  2399.5448   0.006
## Permanent.CountyTulare                       0.3657     1.1026   0.332
## Permanent.CountyVentura                     -0.7961     1.0330  -0.771
##                                              Pr(>|z|)    
## (Intercept)                                   0.19039    
## `Admitted Honors Cohort`Incoming Transfer     0.00605 ** 
## `Admitted Honors Cohort`Others            0.000000416 ***
## `Admitted Honors Cohort`Second Year       0.000082968 ***
## `Admitted Honors Cohort`Third Year        0.000001299 ***
## `First Generation`Y                           0.27608    
## `Chancellor's Scholarship`Y                   0.07117 .  
## `Regents' Scholarship`Y                       0.02011 *  
## Permanent.CountyAlameda County                0.99198    
## Permanent.CountyContra Costa                  0.78191    
## Permanent.CountyImperial                      0.99449    
## Permanent.CountyKern                          0.12767    
## Permanent.CountyKings                         0.99431    
## Permanent.CountyLos Angeles                   0.69398    
## Permanent.CountyLos Angeles County            0.71472    
## Permanent.CountyMarin                         0.99234    
## Permanent.CountyMerced                        0.89587    
## Permanent.CountyMonterey County               0.99449    
## Permanent.CountyOrange                        0.66372    
## Permanent.CountyOrange County                 0.18765    
## Permanent.Countyother                         0.54412    
## Permanent.CountyOther                         0.75261    
## Permanent.Countyothers                        0.53412    
## Permanent.CountyOthers                        0.99421    
## Permanent.CountyPlacer                        0.99442    
## Permanent.CountyRiverside                     0.45878    
## Permanent.CountyRIverside                     0.50441    
## Permanent.CountyRiverside County              0.15060    
## Permanent.CountySacramento                    0.99025    
## Permanent.CountySacramento County             0.99477    
## Permanent.CountySan Bernadino                 0.40923    
## Permanent.CountySan Bernadino County          0.06043 .  
## Permanent.CountySan Bernardino                0.79332    
## Permanent.CountySan Diego                     0.15405    
## Permanent.CountySan Diego County              0.98653    
## Permanent.CountySan Francisco                 0.23951    
## Permanent.CountySan Francisco County          0.99431    
## Permanent.CountySan Joaquin                   0.99481    
## Permanent.CountySan Joaquin County            0.99474    
## Permanent.CountySan Luis Obispo County        0.99449    
## Permanent.CountySanta Barbara                 0.89852    
## Permanent.CountySanta Barbra County           0.99467    
## Permanent.CountySanta Clara                   0.42679    
## Permanent.CountySanta Clara County            0.99029    
## Permanent.CountySanta Cruz                    0.99456    
## Permanent.CountyStanislaus                    0.99491    
## Permanent.CountyTulare                        0.74018    
## Permanent.CountyVentura                       0.44090    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 962.56  on 695  degrees of freedom
## Residual deviance: 819.15  on 648  degrees of freedom
##   (11 observations deleted due to missingness)
## AIC: 915.15
## 
## Number of Fisher Scoring iterations: 15

# Append to the model's recorded Permanent.County levels every level present
# in the test set's factor; union() keeps the existing levels first and adds
# the new ones after them.
# NOTE(review): presumably this stops predict() from failing on counties that
# never occur in the training rows; such counties have no fitted coefficient,
# so their predictions should be treated with caution -- confirm.
model$xlevels[["Permanent.County"]] <- union(model$xlevels[["Permanent.County"]], levels(test$Permanent.County))

# McFadden pseudo R-squared of the fitted model.
pR2(model)["McFadden"]

## fitting null model for pseudo-r2

##  McFadden 
## 0.1489914

# Predicted probability of capstone completion for every test-set student
# (type = "response" returns probabilities rather than log-odds).
predicted <- predict(object = model, newdata = test, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

# Show the predicted probabilities.
print(predicted)

##                1                2                3                4 
## 0.63128364050167 0.48248863495859 0.69425253494619 0.26282203605030 
##                5                6                7                8 
## 0.64043091410467 0.49463573668957 0.56628569017398 0.43558712342004 
##                9               10               11               12 
## 0.48421377589113 0.39474643459579 0.71056140244713 0.27098236333172 
##               13               14               15               16 
## 0.27464712992695 0.27901468667326 0.66729500026103 0.23529298885274 
##               17               18               19               20 
## 0.27901468667326 0.59523595996519 0.57649971848523 0.44392002703446 
##               21               22               23               24 
## 0.43558712342004 0.23529298885274 0.62495359893500 0.52612888999849 
##               25               26               27               28 
## 0.73865192341826 0.60231806352256 0.31385582260057 0.56628569017398 
##               29               30               31               32 
## 0.77346671298049 0.56628569017398 0.52612888999849 0.38073388865796 
##               33               34               35               36 
## 0.20061242562509 0.35059588758209 0.35059588758209 0.43406079332750 
##               37               38               39               40 
## 0.23264175938907 0.69425253494619 0.23264175938907 0.52612888999849 
##               41               42               43               44 
## 0.44757465276438 0.20061242562509 0.20061242562509 0.64997809771371 
##               45               46               47               48 
## 0.51303521446204 0.38833434233277 0.49463573668957 0.73284268435016 
##               49               50               51               52 
## 0.99999976902488 0.12640114156411 0.84273213962884 0.23737764521967 
##               53               54               55               56 
## 0.99999994779239 0.41956151723541 0.23084662033193 0.23084662033193 
##               57               58               59               60 
## 0.12640114156411 0.55177599692566 0.27901468667326 0.75742394643531 
##               61               62               63               64 
## 0.57041512458506 0.77785210122331 0.99999994779239 0.39233998735734 
##               65               66               67               68 
## 0.52361440970679 0.12640114156411 0.48841869656955 0.64429783069062 
##               69               70               71               72 
## 0.00000005904950 0.48841869656955 0.39233998735734 0.14878737194285 
##               73               74               75               76 
## 0.48971753285919 0.84273213962884 0.00000017394363 0.43820020044767 
##               77               78               79               80 
## 0.75890470958106 0.59402596686064 0.00000008715774 0.31857304737321 
##               81               82               83               84 
## 0.87465728194068 0.23737764521967 0.97777932712586 0.12640114156411 
##               85               86               87               88 
## 0.89827818683266 0.23737764521967 0.53690312583771 0.82315010171942 
##               89               90               91               92 
## 0.82315010171942 0.84273213962884 0.00000002882166 0.75890470958106 
##               93               94               95               96 
## 0.41893718274061 0.75890470958106 0.23737764521967 0.27588332819478 
##               97               98               99              100 
## 0.27901468667326 0.90246915609572 0.99999979675735 0.23529298885274 
##              101              102              103              104 
## 0.39828618605924 0.69425253494619 0.55628938050162 0.64043091410467 
##              105              106              107              108 
## 0.51239232149888 0.35785160989078 0.51239232149888 0.46519695983301 
##              109              110              111              112 
## 0.68696234535168 0.44392002703446 0.44757465276438 0.20061242562509 
##              113              114              115              116 
## 0.39828618605924 0.60585596244420 0.69425253494619 0.39788515833969 
##              117              118              119              120 
## 0.65555317529583 0.39474643459579 0.52981548174707 0.41542722888657 
##              121              122              123              124 
## 0.64997809771371 0.44392002703446 0.22787147003986 0.60585596244420 
##              125              126              127              128 
## 0.56628569017398 0.63128364050167 0.78311979192397 0.38881987935297 
##              129              130              131              132 
## 0.37433506334654 0.48421377589113 0.55628938050162 0.49463573668957 
##              133              134              135              136 
## 0.23264175938907 0.20061242562509 0.20061242562509 0.55628938050162 
##              137              138              139              140 
## 0.55565635918266 0.65908802105977 0.12753500546129 0.23577661316400 
##              141              142              143              144 
## 0.44392002703446 0.39474643459579 0.44433321327261 0.44433321327261 
##              145              146              147              148 
## 0.46519695983301 0.56628569017398 0.57649971848523 0.22787147003986 
##              149              150              151              152 
## 0.34832522570697 0.55628938050162 0.65555317529583 0.49463573668957 
##              153              154              155              156 
## 0.65613398742937 0.24831412234201 0.57649971848523 0.52981548174707 
##              157              158              159              160 
## 0.66596253371309 0.52612888999849 0.47891096206494 0.77081223272953 
##              161              162              163              164 
## 0.55565635918266 0.55628938050162 0.66596253371309 0.31385582260057 
##              165              166              167              168 
## 0.27901468667326 0.60231806352256 0.35059588758209 0.22787147003986 
##              169              170              171              172 
## 0.44757465276438 0.15176872911268 0.23625769167886 0.48421377589113 
##              173              174              175              176 
## 0.51239232149888 0.35059588758209 0.23264175938907 0.55628938050162 
##              177              178              179              180 
## 0.46519695983301 0.84680966727698 0.34376160026625 0.39474643459579 
##              181              182              183              184 
## 0.64997809771371               NA 0.56628569017398 0.23529298885274 
##              185              186              187              188 
## 0.46193461459342 0.44392002703446 0.57649971848523 0.56628569017398 
##              189              190              191              192 
## 0.45430183258386 0.24831412234201 0.46193461459342 0.52981548174707 
##              193              194              195              196 
## 0.46193461459342 0.12571846397965 0.75321524991616 0.77081223272953 
##              197              198              199              200 
## 0.39788515833969 0.46519695983301 0.60585596244420 0.00000006389220 
##              201              202              203              204 
##               NA 0.59374236973381 0.46519695983301 0.77081223272953 
##              205              206              207              208 
## 0.51239232149888 0.28524183919705 0.39828618605924 0.35059588758209 
##              209              210              211              212 
## 0.59374236973381 0.22787147003986 0.23264175938907 0.39474643459579 
##              213              214              215              216 
## 0.23264175938907 0.27098236333172 0.52612888999849 0.18155406030782 
##              217              218              219              220 
## 0.09237095963633 0.57649971848523 0.39185822005073 0.73284268435016 
##              221              222              223              224 
## 0.59374236973381 0.00000005219889 0.27464712992695 0.35059588758209 
##              225              226              227              228 
## 0.39788515833969 0.49463573668957 0.41542722888657 0.66596253371309 
##              229              230              231              232 
## 0.27098236333172 0.31857304737321 0.63128364050167 0.23529298885274 
##              233              234              235              236 
## 0.35445952570614 0.18155406030782 0.39788515833969 0.55177599692566 
##              237              238              239              240 
## 0.55628938050162 0.64043091410467               NA 0.35059588758209 
##              241              242              243              244 
## 0.49463573668957 0.46519695983301 0.39474643459579 0.46519695983301 
##              245              246              247              248 
## 0.80879666758164 0.20061242562509 0.64997809771371 0.41805320146836 
##              249              250              251              252 
## 0.31248851293247 0.20061242562509 0.52612888999849 0.55177599692566 
##              253              254              255              256 
## 0.57649971848523 0.73284268435016 0.27901468667326 0.64997809771371 
##              257              258              259              260 
## 0.38881987935297 0.27901468667326 0.51303521446204 0.46193461459342 
##              261              262              263              264 
## 0.39474643459579 0.69857181683979 0.31385582260057 0.24831412234201 
##              265              266              267              268 
## 0.35059588758209 0.43576454174099 0.23529298885274 0.64043091410467 
##              269              270              271              272 
## 0.20061242562509 0.62495359893500 0.39474643459579 0.65555317529583 
##              273              274              275              276 
## 0.20061242562509               NA 0.46519695983301 0.68696234535168 
##              277              278              279              280 
## 0.73572887517637 0.69425253494619 0.24831412234201 0.39474643459579 
##              281              282              283              284 
## 0.24831412234201 0.66596253371309 0.72580030241428 0.47891096206494 
##              285              286              287              288 
## 0.26674260171740               NA 0.88291379043469 0.44757465276438 
##              289              290              291              292 
## 0.57649971848523 0.00000031918926 0.35059588758209 0.24831412234201 
##              293              294              295              296 
## 0.23529298885274 0.23529298885274 0.76336098232387 0.14862181185923 
##              297              298              299              300 
## 0.39474643459579 0.60231806352256 0.35059588758209 0.23529298885274 
##              301              302              303              304 
## 0.27901468667326 0.55544920948967 0.50508518121070 0.64997809771371 
##              305              306              307              308 
## 0.35059588758209 0.23264175938907 0.28524183919705 0.52612888999849

# Evaluate the fitted classifier on the held-out test set (default 0.5 cutoff).
#
# `predicted` holds NA for a few test rows (visible in the printed vector
# above). The confusion matrix silently drops those rows (303 students), but
# specificity() and misClassError() kept them in their denominators
# (117 / 169 and 109 / 308), so the three summaries described different
# samples. Restrict every metric to the rows that actually received a score.
scored <- !is.na(predicted)
actual_scored <- test$`Capstone Project Completion`[scored]
predicted_scored <- predicted[scored]

# Rows are predicted classes, columns are actual classes.
confusionMatrix(actual_scored, predicted_scored)

##     0  1
## 0 117 59
## 1  50 77

# Share of actual non-completers (0) that were predicted 0: 117 / (117 + 50).
specificity(actual_scored, predicted_scored)

## [1] 0.7005988

# Share of scored students whose predicted class is wrong: (59 + 50) / 303.
misClassError(actual_scored, predicted_scored)

## [1] 0.3597

# ROC curve drawn from the same scored rows used for the metrics above.
plotROC(actual_scored, predicted_scored)

STAT183 Group Project Final Analysis

graym004@ucr.edu, vnguy278@ucr.edu, saure001@ucr.edu, jorte030@ucr.edu, ckulk002@ucr.edu

5/7/2022

Cleaning and Processing Data

Descriptive Analysis: Tables

Descriptive Analysis: Visualization

Data Analysis: