HW1.Q1.halatshafer

library('tidyverse')

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Pulling in the data

# Read the tab-delimited students file; header and separator are spelled out
# explicitly (they are the read.delim() defaults).
students <- read.delim(
  file = "/Users/kylerhalat-shafer/Desktop/UVA/MSDS/STAT 6021/Day 1 - R Basics/students.txt",
  header = TRUE,
  sep = "\t"
)

Part A.)

The student ID column can be removed from the data set because it simply repeats the row number of each observation, so it adds no information.

# Drop the Student ID column; every other column is kept.
students <- select(students, -Student)

Part B.)

There is more than one way to find that there are 249 students: you can take the number of rows in the students data set, or create a new data set containing just the students and count them.

# The first element of dim() is the number of rows in the data set.
dim(students)[1]

## [1] 249

Part C.)

There are 12 students who have a missing value in at least one of the columns.

# Show every row that has an NA in at least one column.
students[which(!complete.cases(students)), ]

Part D.)

# Medians of GPA, PartyNum, DaysBeer and StudyHrs, ignoring missing values.
# GPA used to be summarized twice; each variable now appears exactly once,
# every output column has an explicit name, and TRUE replaces the
# reassignable shorthand T.
students %>%
  summarize(
    median_GPA = median(GPA, na.rm = TRUE),
    median_PartyNum = median(PartyNum, na.rm = TRUE),
    median_DaysBeer = median(DaysBeer, na.rm = TRUE),
    median_StudyHrs = median(StudyHrs, na.rm = TRUE)
  )

Part E.)

Overall, female students study more than male students based on both the mean and the median, whereas study time varies more widely among males than among females.

# Mean, median and standard deviation of StudyHrs within each Gender,
# ignoring missing values. Output columns are named explicitly so the
# printed table is readable, and TRUE replaces the reassignable shorthand T.
students %>%
  group_by(Gender) %>%
  summarize(
    mean_StudyHrs = mean(StudyHrs, na.rm = TRUE),
    median_StudyHrs = median(StudyHrs, na.rm = TRUE),
    sd_StudyHrs = sd(StudyHrs, na.rm = TRUE)
  )

Part F.)

# Add PartyAnimal: "yes" when PartyNum is greater than 8, otherwise "no"
# (NA stays NA). The column is referenced by its bare name so mutate() reads
# it from the data flowing through the pipe instead of reaching back into
# the global `students` object.
students <- students %>%
  mutate(PartyAnimal = ifelse(PartyNum > 8, "yes", "no"))

Part G.)

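Include the `right = FALSE` argument so that each interval is closed on the left and open on the right; a GPA of exactly 3 is therefore excluded from the ‘low’ category and falls into ‘moderate’.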

# Add GPA.cat by binning GPA into three ordered categories.
# right = FALSE makes the bins left-closed: low = [-Inf, 3),
# moderate = [3, 3.5), high = [3.5, Inf).
# GPA is referenced by its bare name so mutate() uses the piped data rather
# than the global `students` object.
students <- students %>%
  mutate(
    GPA.cat = cut(
      GPA,
      breaks = c(-Inf, 3, 3.5, Inf),
      labels = c("low", "moderate", "high"),
      right = FALSE
    )
  )

Part H.)

There are 29 students who have a low GPA (below 3.0), party more than 8 days a month, and study less than 15 hours a week.

# Add StudyTime: "High" when StudyHrs is greater than 15, otherwise "Low".
# StudyHrs is referenced by its bare name so mutate() uses the piped data.
# NOTE(review): StudyTime is "Low" at exactly 15 hours, while the filter
# below uses StudyHrs < 15 and so excludes 15 -- confirm which cutoff is
# intended. StudyTime is not used by the filter.
students <- students %>%
  mutate(StudyTime = ifelse(StudyHrs > 15, "High", "Low"))

# Students in the low GPA category who are party animals and have StudyHrs
# below 15. Comma-separated conditions in filter() are combined with AND.
poorstudent <- students %>%
  filter(GPA.cat == "low", PartyAnimal == "yes", StudyHrs < 15)

poorstudent

Part I.)

There are 87 students in the low GPA category, 85 in moderate, and 70 in high.

# Frequency of each GPA category (NA values are not counted by table()).
table(students[["GPA.cat"]])

## 
##      low moderate     high 
##       87       85       70

# Store the GPA category counts, then express each count as a share of the
# table total.
mytab3 <- table(students[["GPA.cat"]])
mytab3 / sum(mytab3)

## 
##       low  moderate      high 
## 0.3595041 0.3512397 0.2892562

Part J.)

# Bar chart of the number of students in each GPA category; rows with a
# missing category are removed first, and the title is centered.
students %>%
  filter(!is.na(GPA.cat)) %>%
  ggplot(aes(x = GPA.cat)) +
  geom_bar() +
  labs(
    title = "GPA Category by Student",
    x = "GPA Category",
    y = "Number of Students"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

Part K.)

# Count students per GPA category and convert the counts to proportions.
# The denominator is the number of students that survive the NA filter
# (sum(Counts)), not nrow(students): dividing by all rows made the shares
# sum to less than 1 and disagree with prop.table(table(students$GPA.cat)).
newData <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat) %>%
  summarize(Counts = n()) %>% # number of observations in each category
  mutate(Percent = Counts / sum(Counts)) # share of non-missing students
newData

# Bar chart of the precomputed Percent values per GPA category
# (geom_col() draws bar heights straight from the y aesthetic).
# X-axis labels are rotated 90 degrees and the title is centered.
ggplot(newData, aes(x = GPA.cat, y = Percent)) +
  geom_col() +
  labs(title = "Dist of GPA", x = "GPA Category", y = "Percent of Students") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )

Part L.)

# Two-way table of GPA category (rows) by gender (columns), followed by each
# cell expressed as a share of the table total.
genderGPA <- table(students[["GPA.cat"]], students[["Gender"]])
genderGPA / sum(genderGPA)

##           
##                female       male
##   low      0.16942149 0.19008264
##   moderate 0.21487603 0.13636364
##   high     0.19008264 0.09917355

Part M.)

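Females consistently have better GPAs: they make up a larger share of the moderate and high categories, while males make up a larger share of the low category.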

# Cell shares of the GPA-by-gender table as percentages, to two decimals.
round(100 * prop.table(genderGPA), digits = 2)

##           
##            female  male
##   low       16.94 19.01
##   moderate  21.49 13.64
##   high      19.01  9.92

Part N.)

# Count students per GPA category / gender combination and convert the
# counts to proportions of all students with a non-missing GPA category.
# summarize() leaves the result grouped by GPA.cat, so it is ungrouped before
# computing Percent; the denominator is sum(Counts) rather than
# nrow(students), which also counted the rows dropped by the NA filter and
# disagreed with prop.table(genderGPA).
newData2 <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat, Gender) %>%
  summarize(Counts = n()) %>% # observations per GPA category and gender
  ungroup() %>%
  mutate(Percent = Counts / sum(Counts)) # share of non-missing students

## `summarise()` has grouped output by 'GPA.cat'. You can override using the
## `.groups` argument.

newData2

# Stacked bar chart of the precomputed Percent values per GPA category, with
# bars filled by gender. X-axis labels are rotated 90 degrees and the title
# is centered.
ggplot(newData2, aes(x = GPA.cat, y = Percent, fill = Gender)) +
  geom_col() +
  labs(title = "Dist of GPA", x = "GPA Category", y = "Percent of Students") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90)
  )

Part O.)

# Count students per GPA category / gender / smoking-status combination and
# convert the counts to proportions of all students with a non-missing GPA
# category. summarize() leaves the result grouped by GPA.cat and Gender, so
# it is ungrouped before computing Percent; the denominator is sum(Counts)
# rather than nrow(students), which also counted the rows dropped by the NA
# filter.
newData3 <- students %>%
  filter(!is.na(GPA.cat)) %>%
  group_by(GPA.cat, Gender, Smoke) %>%
  summarize(Counts = n()) %>% # observations per category, gender and smoker
  ungroup() %>%
  mutate(Percent = Counts / sum(Counts)) # share of non-missing students

## `summarise()` has grouped output by 'GPA.cat', 'Gender'. You can override using
## the `.groups` argument.

newData3

More students do not smoke than do, and females account for a higher percentage of students than males among both smokers and non-smokers.

# Stacked bar chart of Percent per GPA category, filled by gender, with one
# panel per Smoke value. Every layer is chained with `+`: previously the
# chain ended after facet_wrap(), so theme() and labs() were evaluated as a
# separate expression and never applied to the plot.
ggplot(newData3, aes(x = GPA.cat, y = Percent, fill = Gender)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Smoke) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    x = "GPA Category",
    y = "Percent of Students",
    title = "Distribution of GPA by Smoker and Gender"
  )

## List of 5
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ x          : chr "GPA Category"
##  $ y          : chr "Percent of Students"
##  $ title      : chr "Distribution of GPA by Smoker and Gender"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Part P.)

There appears to be only a minimal relationship between study hours and GPA: both low and high GPAs occur across the full range of study hours.

# Scatterplot with GPA on the x-axis and StudyHrs on the y-axis; points are
# semi-transparent so overlapping observations remain visible.
# NOTE(review): Parts Q and R put StudyHrs on x and GPA on y -- confirm
# whether this plot should use the same orientation.
ggplot(students, aes(x = GPA, y = StudyHrs)) +
  geom_point(alpha = 0.3, size = 2)

## Warning: Removed 7 rows containing missing values (geom_point).

### Part Q.)

# Scatterplot of GPA (y) against StudyHrs (x), with point size mapped to
# PartyNum; points are semi-transparent.
ggplot(students, aes(x = StudyHrs, y = GPA, size = PartyNum)) +
  geom_point(alpha = 0.3)

## Warning: Removed 12 rows containing missing values (geom_point).

### Part R.)

# Scatterplot of GPA (y) against StudyHrs (x), with point size mapped to
# PartyNum and point color mapped to Smoke; points are semi-transparent.
ggplot(
  students,
  aes(x = StudyHrs, y = GPA, size = PartyNum, color = Smoke)
) +
  geom_point(alpha = 0.5)

## Warning: Removed 12 rows containing missing values (geom_point).