# Load the FirstYearGPA data set directly from its raw GitHub URL
gpa <- read.csv(
  file = "https://raw.githubusercontent.com/miachen410/FirstYearGPA/master/FirstYearGPA.csv"
)
# Preview the first six rows to get a feel for the columns
head(gpa, n = 6L)
## X GPA HSGPA SATV SATM Male HU SS FirstGen White CollegeBound
## 1 1 3.06 3.83 680 770 1 3.0 9.0 1 1 1
## 2 2 4.15 4.00 740 720 0 9.0 3.0 0 1 1
## 3 3 3.41 3.70 640 570 0 16.0 13.0 0 0 1
## 4 4 3.21 3.51 740 700 0 22.0 0.0 0 1 1
## 5 5 3.48 3.83 610 610 0 30.5 1.5 0 1 1
## 6 6 2.95 3.25 600 570 0 18.0 3.0 0 1 1
# Inspect the structure of the data frame: dimensions and the type of each column
utils::str(object = gpa)
## 'data.frame': 219 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ GPA : num 3.06 4.15 3.41 3.21 3.48 2.95 3.6 2.87 3.67 3.49 ...
## $ HSGPA : num 3.83 4 3.7 3.51 3.83 3.25 3.79 3.6 3.36 3.7 ...
## $ SATV : int 680 740 640 740 610 600 710 390 630 680 ...
## $ SATM : int 770 720 570 700 610 570 630 570 560 670 ...
## $ Male : int 1 0 0 0 0 0 0 0 0 0 ...
## $ HU : num 3 9 16 22 30.5 18 5 10 8.5 16 ...
## $ SS : num 9 3 13 0 1.5 3 19 0 15.5 12 ...
## $ FirstGen : int 1 0 0 0 0 0 0 0 0 0 ...
## $ White : int 1 1 0 1 1 1 1 0 1 1 ...
## $ CollegeBound: int 1 1 1 1 1 1 1 0 1 1 ...
# Summary statistics for every column: minimum, quartiles, median, mean and maximum
summary(object = gpa)
## X GPA HSGPA SATV
## Min. : 1.0 Min. :1.930 Min. :2.340 Min. :260.0
## 1st Qu.: 55.5 1st Qu.:2.745 1st Qu.:3.170 1st Qu.:565.0
## Median :110.0 Median :3.150 Median :3.500 Median :610.0
## Mean :110.0 Mean :3.096 Mean :3.453 Mean :605.1
## 3rd Qu.:164.5 3rd Qu.:3.480 3rd Qu.:3.760 3rd Qu.:670.0
## Max. :219.0 Max. :4.150 Max. :4.000 Max. :740.0
## SATM Male HU SS
## Min. :430.0 Min. :0.0000 Min. : 0.00 Min. : 0.000
## 1st Qu.:580.0 1st Qu.:0.0000 1st Qu.: 8.00 1st Qu.: 3.000
## Median :640.0 Median :0.0000 Median :13.00 Median : 6.000
## Mean :634.3 Mean :0.4658 Mean :13.11 Mean : 7.249
## 3rd Qu.:690.0 3rd Qu.:1.0000 3rd Qu.:17.00 3rd Qu.:11.000
## Max. :800.0 Max. :1.0000 Max. :40.00 Max. :21.000
## FirstGen White CollegeBound
## Min. :0.0000 Min. :0.00 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.00 1st Qu.:1.0000
## Median :0.0000 Median :1.00 Median :1.0000
## Mean :0.1142 Mean :0.79 Mean :0.9224
## 3rd Qu.:0.0000 3rd Qu.:1.00 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00 Max. :1.0000
All variables are integer or numeric. The mean first-year GPA is 3.096 and the median is 3.150. The mean high school GPA is 3.453 and the median is 3.500. 46.58% of the students are male, and 11.42% are first-generation college students.
# Tabulate the Male indicator to count female (0) and male (1) students
table(gpa[["Male"]])
##
## 0 1
## 117 102
There are 102 male students and 117 female students.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Derive the combined SAT score as the sum of the verbal and math sections
gpa <- gpa %>%
  mutate(SAT = SATV + SATM)
# Build the data frame `males`: keep only rows where Male == 1, restricted to
# the columns X, GPA, HSGPA, SAT, FirstGen and White
males <- gpa %>%
  filter(Male == 1) %>%
  select(X, GPA, HSGPA, SAT, FirstGen, White)
# Preview the first six rows of males
head(males, n = 6L)
## X GPA HSGPA SAT FirstGen White
## 1 1 3.06 3.83 1450 1 1
## 2 13 3.85 3.81 1420 0 1
## 3 14 2.58 3.38 1460 0 1
## 4 20 3.16 3.83 1220 0 1
## 5 21 3.78 3.98 1450 0 1
## 6 23 2.81 3.59 1210 0 1
# Scatter plot: high school GPA against first-year college GPA, males only
plot(
  x = males$HSGPA,
  y = males$GPA,
  main = "High School GPA v.s. First Year College GPA of Males",
  xlab = "High School GPA",
  ylab = "College GPA"
)
# Histogram of first-year college GPA among males
hist(
  x = males$GPA,
  main = "First Year College GPA Distribution among Males",
  xlab = "GPA"
)
# Side-by-side box plots of first-year college GPA by sex; Male is coded 0/1,
# so the group labels are supplied in that order (0 = Female, 1 = Male)
boxplot(
  GPA ~ Male,
  data = gpa,
  main = "First Year College GPA of Females and Males",
  names = c("Female", "Male")
)
library(ggplot2)
# Male is stored as a 0/1 integer (0 = female, 1 = male). Mapping the raw
# integer to colour makes ggplot2 treat it as continuous and draw a gradient
# with a colour-bar legend, so the plots below map factor(Male) instead and
# attach this explicit two-colour scale: black for female students, blue for
# male students, with a labelled discrete legend.
sex_colour_scale <- function() {
  scale_color_manual(
    name = "Sex",
    breaks = c("0", "1"),
    labels = c("Female", "Male"),
    values = c("0" = "black", "1" = "blue")
  )
}
# Scatter plot of combined SAT score against first-year college GPA,
# coloured by sex
a <- ggplot(gpa, aes(x = SAT, y = GPA, color = factor(Male))) +
  geom_point() +
  sex_colour_scale() +
  ggtitle("SAT v.s. College GPA for Male and Female")
a
# Scatter plot of high school GPA against first-year college GPA,
# coloured by sex
b <- ggplot(gpa, aes(x = HSGPA, y = GPA, color = factor(Male))) +
  geom_point() +
  sex_colour_scale() +
  ggtitle("High School GPA v.s. College GPA for Male and Female")
b