library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the tab-delimited student survey into a data frame.
students <- read.delim(
  file = "/Users/kylerhalat-shafer/Desktop/UVA/MSDS/STAT 6021/Day 1 - R Basics/students.txt"
)
The student ID column can be removed from the data set: it simply repeats the row number, so it carries no additional information.
# Drop the Student (ID) column; every other column is kept.
students <- select(students, -Student)
There is more than one way to find that there are 249 students: take the number of rows in the students data set, or create a new data set containing just the students and count its rows.
# Each row is one student, so the row count is the number of students.
students %>%
  nrow()
## [1] 249
There are 12 students with a missing value in at least one of the columns.
# Show only the rows that have a missing value in at least one column.
subset(students, !complete.cases(students))
# Median of each numeric measure across all students, ignoring missing values.
# Each output column carries the name of the column it summarizes.
students %>%
  summarize(
    across(
      c(GPA, PartyNum, DaysBeer, StudyHrs),
      ~ median(.x, na.rm = TRUE)
    )
  )
Overall, female students study more than male students based on both the mean and the median, whereas study time varies more widely among males than among females.
# Center (mean, median) and spread (standard deviation) of study hours for
# each gender, ignoring missing values. The result is returned ungrouped.
students %>%
  group_by(Gender) %>%
  summarize(
    mean_study_hrs = mean(StudyHrs, na.rm = TRUE),
    median_study_hrs = median(StudyHrs, na.rm = TRUE),
    sd_study_hrs = sd(StudyHrs, na.rm = TRUE),
    .groups = "drop"
  )
# Flag students with PartyNum above 8 as party animals ("yes"), otherwise "no".
# The column is referenced directly (not via students$...) so the calculation
# always uses the data flowing through the pipe. A missing PartyNum yields NA.
students <- students %>%
  mutate(PartyAnimal = ifelse(PartyNum > 8, "yes", "no"))
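We include the right = FALSE argument so that the intervals are closed on the left and open on the right; a GPA of exactly 3 is therefore excluded from the ‘low’ category and falls in ‘moderate’.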
# Bin GPA into three ordered categories. With right = FALSE the intervals are
# left-closed: low = [-Inf, 3), moderate = [3, 3.5), high = [3.5, Inf).
# GPA is referenced directly (not via students$...) so the calculation always
# uses the data flowing through the pipe. A missing GPA yields NA.
students <- students %>%
  mutate(
    GPA.cat = cut(
      GPA,
      breaks = c(-Inf, 3, 3.5, Inf),
      labels = c("low", "moderate", "high"),
      right = FALSE
    )
  )
There are 29 students who have a low GPA (below 3.0), party more than 8 days a month, and study less than 15 hours a week.
# Label study time as "High" when StudyHrs is above 15, otherwise "Low".
# StudyHrs is referenced directly (not via students$...) so the calculation
# always uses the data flowing through the pipe.
students <- students %>%
  mutate(StudyTime = ifelse(StudyHrs > 15, "High", "Low"))

# Students who are in the low GPA category, are flagged as party animals, and
# study fewer than 15 hours. Rows where any condition is NA are dropped.
# NOTE(review): the filter uses StudyHrs < 15 rather than StudyTime == "Low",
# so a student with exactly 15 hours is "Low" above but excluded here - confirm
# which cutoff is intended.
poorstudent <- students %>%
  filter(GPA.cat == "low", PartyAnimal == "yes", StudyHrs < 15)
poorstudent
There are 87 students in the low GPA category, 85 in moderate, and 70 in high.
# Number of students in each GPA category (missing categories are not counted).
table(students[["GPA.cat"]])
##
## low moderate high
## 87 85 70
# Store the GPA category counts, then express each count as a share of the
# table total.
mytab3 <- table(students[["GPA.cat"]])
mytab3 / sum(mytab3)
##
## low moderate high
## 0.3595041 0.3512397 0.2892562
# Bar chart of the number of students per GPA category, leaving out students
# whose GPA category is missing.
students %>%
  filter(!is.na(GPA.cat)) %>%
  ggplot(aes(x = GPA.cat)) +
  geom_bar() +
  labs(
    title = "GPA Category by Student",
    x = "GPA Category",
    y = "Number of Students"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Count of students per GPA category and each category's share of the students
# with a recorded GPA category.
newData <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat) %>%
  summarize(Counts = n(), .groups = "drop") %>%
  # Divide by the rows that survived the filter, not nrow(students): rows with
  # a missing GPA category are excluded from the counts, so they must be
  # excluded from the denominator too or the shares will not sum to 1.
  # Percent is stored as a proportion between 0 and 1.
  mutate(Percent = Counts / sum(Counts))
newData
# Bar chart of the precomputed share of students in each GPA category; bar
# heights are taken directly from the Percent column.
ggplot(newData, aes(x = GPA.cat, y = Percent)) +
  geom_col() +
  labs(
    title = "Dist of GPA",
    x = "GPA Category",
    y = "Percent of Students"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
# Two-way table of GPA category (rows) by gender (columns), followed by each
# cell's share of the table total.
genderGPA <- table(students[["GPA.cat"]], students[["Gender"]])
genderGPA / sum(genderGPA)
##
## female male
## low 0.16942149 0.19008264
## moderate 0.21487603 0.13636364
## high 0.19008264 0.09917355
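Female students tend to have better GPAs: they make up a larger share of the moderate and high categories, while male students make up a larger share of the low category.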
# Same cell shares expressed as percentages, rounded to two decimal places.
round(100 * prop.table(genderGPA), digits = 2)
##
## female male
## low 16.94 19.01
## moderate 21.49 13.64
## high 19.01 9.92
# Count of students per GPA category and gender, and each combination's share
# of the students with a recorded GPA category.
newData2 <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat, Gender) %>%
  # Drop the grouping so the share below is computed over the whole table
  # rather than within each GPA category.
  summarize(Counts = n(), .groups = "drop") %>%
  # Divide by the rows that survived the filter, not nrow(students): rows with
  # a missing GPA category are excluded from the counts, so they must be
  # excluded from the denominator too or the shares will not sum to 1.
  # Percent is stored as a proportion between 0 and 1.
  mutate(Percent = Counts / sum(Counts))
newData2
# Stacked bar chart of the precomputed shares per GPA category, with each bar
# split by gender; bar heights are taken directly from the Percent column.
ggplot(newData2, aes(x = GPA.cat, y = Percent, fill = Gender)) +
  geom_col() +
  labs(
    title = "Dist of GPA",
    x = "GPA Category",
    y = "Percent of Students"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )
# Count of students per GPA category, gender and smoking status, and each
# combination's share of the students with a recorded GPA category.
newData3 <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat, Gender, Smoke) %>%
  # Drop the grouping so the share below is computed over the whole table
  # rather than within each GPA category / gender pair.
  summarize(Counts = n(), .groups = "drop") %>%
  # Divide by the rows that survived the filter, not nrow(students): rows with
  # a missing GPA category are excluded from the counts, so they must be
  # excluded from the denominator too or the shares will not sum to 1.
  # Percent is stored as a proportion between 0 and 1.
  mutate(Percent = Counts / sum(Counts))
newData3
More students do not smoke than do, and among both smokers and non-smokers, females account for a higher percent of students than males.
# Stacked bar chart of the precomputed shares per GPA category, split by
# gender, with one panel per value of Smoke.
# Every layer must be joined with `+`: without it after facet_wrap(), the
# theme() and labs() calls form a separate expression that is printed as a
# list instead of being applied to the plot.
ggplot(newData3, aes(x = GPA.cat, y = Percent, fill = Gender)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Smoke) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    x = "GPA Category",
    y = "Percent of Students",
    title = "Distribution of GPA by Smoker and Gender"
  )
There appears to be only a minimal relationship between study hours and GPA, with both low and high GPAs occurring across the whole range of study hours.
# Scatterplot of StudyHrs against GPA; points are semi-transparent so that
# overlapping observations show up darker.
ggplot(students, aes(x = GPA, y = StudyHrs)) +
  geom_point(alpha = 0.3, size = 2)
## Warning: Removed 7 rows containing missing values (geom_point).
### Part Q.)
# Scatterplot of GPA against StudyHrs, with point size mapped to PartyNum.
ggplot(students, aes(x = StudyHrs, y = GPA, size = PartyNum)) +
  geom_point(alpha = 0.3)
## Warning: Removed 12 rows containing missing values (geom_point).
### Part R.)
# Scatterplot of GPA against StudyHrs, with point size mapped to PartyNum and
# point color mapped to Smoke.
ggplot(
  students,
  aes(x = StudyHrs, y = GPA, size = PartyNum, color = Smoke)
) +
  geom_point(alpha = 0.5)
## Warning: Removed 12 rows containing missing values (geom_point).