rm(list=ls())
library(dplyr)
library(ggplot2)
library(corrplot)
library(cowplot)
# Load the survey responses from the CSV in the working directory.
data <- read.csv("Final_Project.csv")
# Open the spreadsheet viewer only when running interactively; View() needs a
# display and can fail when the script is run non-interactively.
if (interactive()) {
  View(data)
}
# Keep only the numeric columns and plot their pairwise correlations as
# numbers on a brown background.
numdata <- select_if(data, is.numeric)
cordata <- cor(numdata)
corrplot(cordata, method = "number", bg = "brown")

I am making a corrplot to check the correlation of some of my
data.
# Simplified corrplot: remove the columns that could not be plotted, then
# recompute and redraw the correlation matrix.
simpledata <- select(
  numdata,
  -c(caffeine_amount, adderall_dosage, study_hrs_exam, study_hrs_no_exam, job_hrs)
)
simplecordata <- simpledata %>% cor()
corrplot(simplecordata, bg = "brown", method = "number")

# Split the responses at 50 on ai_use: strictly above vs. at or below.
ai_over_50 <- filter(data, ai_use > 50)
ai_under_50 <- filter(data, ai_use <= 50)
I want to separate the values for AI use over and under 50% just to
see the disparity between the 2 groups.
# Scatter of AI use against GPA; red points are the over-50 group and blue
# points are the 50-or-under group.
ggplot(mapping = aes(x = gpa, y = ai_use)) +
  geom_point(data = ai_over_50, color = "red") +
  geom_point(data = ai_under_50, color = "blue") +
  labs(title = "AI Use vs. GPA", x = "GPA", y = "AI Use (%)")

Here we can see that there is no real stark difference between the
GPAs of students who use AI more than 50% of the time and students who
use it less than 50% of the time. This is also reflected in our corrplot
since it said that there is practically 0 correlation between the 2.
What this probably means is that students who use AI in their studying
(remember, this variable has nothing to do with use on an exam or the
like, but rather as a primary source of information when studying),
don’t seem to be hindered by its use. This can mean that when used
properly, AI can be a helpful study tool.
# Bar chart: number of respondents for each study group size.
a <- ggplot(data) +
  geom_bar(aes(x = study_group_size), fill = "blue")
# Bar chart: number of respondents at each GPA value, with axis ticks every 0.5.
b <- ggplot(data) +
  geom_bar(aes(x = gpa), fill = "blue") +
  scale_x_continuous(breaks = seq(0, 4, by = 0.5))
# Scatter plot: study group size against GPA.
# NOTE(review): the name `c` shadows base::c; consider renaming.
c <- ggplot(data) +
  geom_point(aes(x = study_group_size, y = gpa), color = "blue") +
  labs(x = "Study Group Size", y = "GPA", title = "Study Group Size vs. GPA")
# Draw the three panels side by side. The plots are only assigned above, so
# without this call none of them is rendered.
plot_grid(a, b, c, nrow = 1)
These 3 graphs go hand in hand: since the point graph doesn’t give us
a good representation of the actual number of people who gave an answer,
the barplots do that for us. Looking at all 3 together tells us that most
people study on their own and the majority of people have a GPA of
between about 3.3 and 3.7 (between B+ and A-). If you look at our
corrplot, while they aren’t super highly correlated, they have a
correlation of -0.23, meaning when study group size goes down, GPA
slightly goes up. This helps to support our hypothesis since we predicted
that fewer people studying together would lead to better grades.
"study hours, gpa, study location"
[1] "study hours, gpa, study location"
"avggpa plot against study_location"
[1] "avggpa plot against study_location"
"study_location plot agasint study_hrs_exam"
[1] "study_location plot agasint study_hrs_exam"
"I am going to make a frequency table that tells me how many times each study location showed up, the create a new data set using those values. I will be creating three different data frames so as to not reuse a dataset that will not work. I aim to plot the frequency of people studying in each location against their gpa's, and additionally I would like to plot how long on average people are studying for."
[1] "I am going to make a frequency table that tells me how many times each study location showed up, the create a new data set using those values. I will be creating three different data frames so as to not reuse a dataset that will not work. I aim to plot the frequency of people studying in each location against their gpa's, and additionally I would like to plot how long on average people are studying for."
# Print how many respondents chose each study location (the echoed counts
# follow on the next four lines).
table(data$study_location)
Dorm Room/Bedroom Library Living Room/Common Room Other Establishment (Starbucks, cafe, etc)
26 16 7 6
Outside
1
# Store the same counts as a data frame and make two working copies of it,
# one for the GPA plot and one for the study-hours plot.
# NOTE(review): the variable name `table` shadows base::table; consider renaming.
table <- table(data$study_location)
table <- as.data.frame(table)
table_gpa <- table
table_exam <- table
# Bar chart of the counts: Var1 is the study location, Freq is the count.
ggplot(table_gpa, aes(x = Var1, y = Freq)) +
geom_bar(stat = "identity", fill = "skyblue") +
xlab("Category") +
ylab("Value") +
ggtitle("Bar Plot Example")

#This bar plot shows the frequency of people who study at a certain location
# Average GPA for each study location, computed one location at a time:
# filter the rows for that location, keep the gpa column, take the mean.
# NOTE(review): the recorded warnings below suggest gpa was not numeric when
# this ran -- confirm the column type after read.csv.
dorm1 <- data %>% filter(study_location == "Dorm Room/Bedroom")
dorm2 <- dorm1 %>% select( study_location, gpa)
avg_gpa_dorm <- mean(dorm2$gpa, na.rm = TRUE)
Warning: argument is not numeric or logical: returning NA
lib1 <- data %>% filter(study_location == "Library")
lib2 <- lib1 %>% select( study_location, gpa)
avg_gpa_lib <- mean(lib2$gpa, na.rm = TRUE)
Warning: argument is not numeric or logical: returning NA
cr1 <- data %>% filter(study_location == "Living Room/Common Room")
cr2 <- cr1 %>% select( study_location, gpa)
avg_gpa_cr <- mean(cr2$gpa, na.rm = TRUE)
Warning: argument is not numeric or logical: returning NA
cafe1 <- data %>% filter(study_location == "Other Establishment (Starbucks, cafe, etc)")
cafe2 <- cafe1 %>% select( study_location, gpa)
avg_gpa_cafe <- mean(cafe2$gpa, na.rm = TRUE)
Warning: argument is not numeric or logical: returning NA
out1 <- data %>% filter(study_location == "Outside")
# Fixed: this selected from cafe1, so the "Outside" average was a copy of the
# cafe average. It must use the "Outside" rows in out1.
out2 <- out1 %>% select( study_location, gpa)
avg_gpa_out <- mean(out2$gpa, na.rm = TRUE)
Warning: argument is not numeric or logical: returning NA
# The averages are assigned by position, so they must stay in the same order
# as the rows of table_gpa.
table_gpa$avg_gpa <- c(avg_gpa_dorm, avg_gpa_lib, avg_gpa_cr, avg_gpa_cafe, avg_gpa_out)
#In the code above, I used the filter function to filter out each option of study location from the main data set titled data, and then selected the gpa column so as to make it into a new dataframe. I then used the gpas of each study location to find the average using the frequency of the study location. Formula being = (sum of gpa's/frequency of study location). I then added a new column into the table_gpa data frame so I can plot everything onto one graph.
# Average GPA (red points) and respondent count (translucent bars) for each
# study location, with the location labels rotated to vertical.
ggplot(table_gpa, aes(x = Var1)) +
  geom_point(aes(y = avg_gpa), size = 3, color = "red") +
  geom_col(aes(y = Freq), alpha = 0.5, fill = "skyblue") +
  labs(x = "Category", y = "Value", title = "Scatter Plot and Bar Graph") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

#This is the graph that combines frequency of study location with average gpa
https://www.dataquest.io/blog/add-column-to-dataframe-in-r/
# Average study_hrs_exam for each study location, computed one location at a
# time: filter the rows for that location, then take the mean of the column.
dorm_test1 <- data %>% filter(study_location == "Dorm Room/Bedroom")
dorm_test2 <- dorm_test1 %>% select( study_hrs_exam, gpa)
study_test_dorm <- mean(dorm_test2$study_hrs_exam, na.rm = TRUE)
lib_test1 <- data %>% filter(study_location == "Library")
lib_test2 <- lib_test1 %>% select( study_hrs_exam, gpa)
study_test_lib <- mean(lib_test2$study_hrs_exam, na.rm = TRUE)
cr_test1 <- data %>% filter(study_location == "Living Room/Common Room")
cr_test2 <- cr_test1 %>% select( study_hrs_exam, gpa)
study_test_cr <- mean(cr_test2$study_hrs_exam, na.rm = TRUE)
cafe_test1 <- data %>% filter(study_location == "Other Establishment (Starbucks, cafe, etc)")
cafe_test2 <- cafe_test1 %>% select( study_hrs_exam, gpa)
study_test_cafe <- mean(cafe_test2$study_hrs_exam, na.rm = TRUE)
out_test1 <- data %>% filter(study_location == "Outside")
# Fixed: this selected from cafe_test1, so the "Outside" average was a copy of
# the cafe average. It must use the "Outside" rows in out_test1.
out_test2 <- out_test1 %>% select( study_hrs_exam, gpa)
study_test_out <- mean(out_test2$study_hrs_exam, na.rm = TRUE)
# Print the frequency data frame for reference.
table
# The averages are assigned by position, so they must stay in the same order
# as the rows of table_exam.
table_exam$avg_time_study <- c(study_test_dorm, study_test_lib, study_test_cr, study_test_cafe, study_test_out)
#In the code above, I did the same thing as I did with study location and GPA, except with hours studied at each location.
# Average study hours (red points) and respondent count (translucent bars)
# for each study location, with the location labels rotated to vertical.
ggplot(table_exam, aes(x = Var1)) +
  geom_point(aes(y = avg_time_study), size = 3, color = "red") +
  geom_col(aes(y = Freq), alpha = 0.5, fill = "skyblue") +
  labs(x = "Category", y = "Value", title = "Scatter Plot and Bar Graph") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

#This is the graph that combines frequency of study location with average hours studied with an exam
---
title: "R Notebook"
output: html_notebook
---


```{r}
rm(list=ls())
library(dplyr)
library(ggplot2)
library(corrplot)
library(cowplot)
```

```{r}
# Load the survey responses from the CSV in the working directory.
data <- read.csv("Final_Project.csv")
# Open the spreadsheet viewer only when running interactively; View() needs a
# display and can fail when the notebook is rendered non-interactively.
if (interactive()) {
  View(data)
}
```

```{r}
# Correlation matrix over every numeric column, drawn as a grid of
# coefficients on a brown background.
numdata <- data %>% select_if(is.numeric)
cordata <- numdata %>% cor()
corrplot(cordata, bg = "brown", method = "number")
```
I am making a corrplot to check the correlation of some of my data.

```{r}
# Simplified corrplot: remove the columns that could not be plotted, then
# recompute and redraw the correlation matrix.
simpledata <- select(
  numdata,
  -c(caffeine_amount, adderall_dosage, study_hrs_exam, study_hrs_no_exam, job_hrs)
)
simplecordata <- simpledata %>% cor()
corrplot(simplecordata, bg = "brown", method = "number")

```

```{r}
# Split the responses at 50 on ai_use: strictly above vs. at or below.
ai_over_50 <- filter(data, ai_use > 50)
ai_under_50 <- filter(data, ai_use <= 50)
```
I want to separate the values for AI use over and under 50% just to see the disparity between the 2 groups.


```{r}
# Scatter of AI use against GPA; red points are the over-50 group and blue
# points are the 50-or-under group.
ggplot(mapping = aes(x = gpa, y = ai_use)) +
  geom_point(data = ai_over_50, color = "red") +
  geom_point(data = ai_under_50, color = "blue") +
  labs(title = "AI Use vs. GPA", x = "GPA", y = "AI Use (%)")
```
Here we can see that there is no real stark difference between the GPAs of students who use AI more than 50% of the time and students who use it less than 50% of the time. This is also reflected in our corrplot since it said that there is practically 0 correlation between the 2. What this probably means is that students who use AI in their studying (remember, this variable has nothing to do with use on an exam or the like, but rather as a primary source of information when studying), don't seem to be hindered by its use. This can mean that when used properly, AI can be a helpful study tool.

```{r}
# Bar chart: number of respondents for each study group size.
a <- ggplot(data) +
  geom_bar(aes(x = study_group_size), fill = "blue")

# Bar chart: number of respondents at each GPA value, with axis ticks every 0.5.
b <- ggplot(data) +
  geom_bar(aes(x = gpa), fill = "blue") +
  scale_x_continuous(breaks = seq(0, 4, by = 0.5))

# Scatter plot: study group size against GPA.
# NOTE(review): the name `c` shadows base::c; consider renaming.
c <- ggplot(data) +
  geom_point(aes(x = study_group_size, y = gpa), color = "blue") +
  labs(x = "Study Group Size", y = "GPA", title = "Study Group Size vs. GPA")

# Draw the three panels side by side. The plots are only assigned above, so
# without this call none of them is rendered.
plot_grid(a, b, c, nrow = 1)

```
These 3 graphs go hand in hand: since the point graph doesn't give us a good representation of the actual number of people who gave an answer, the barplots do that for us. Looking at all 3 together tells us that most people study on their own and the majority of people have a GPA of between about 3.3 and 3.7 (between B+ and A-). If you look at our corrplot, while they aren't super highly correlated, they have a correlation of -0.23, meaning when study group size goes down, GPA slightly goes up. This helps to support our hypothesis since we predicted that fewer people studying together would lead to better grades.

```{r}
# Section notes (kept as comments so they are not echoed into the output):
#   - study hours, gpa, study location
#   - avggpa plot against study_location
#   - study_location plot against study_hrs_exam
#
# I am going to make a frequency table that tells me how many times each study
# location showed up, then create a new data set using those values. I will be
# creating three different data frames so as to not reuse a dataset that will
# not work. I aim to plot the frequency of people studying in each location
# against their gpa's, and additionally I would like to plot how long on
# average people are studying for.

# Print the number of respondents per study location.
table(data$study_location)

# Store the same counts as a data frame and make two working copies of it,
# one for the GPA plot and one for the study-hours plot.
# NOTE(review): the name `table` shadows base::table; it is kept because a
# later chunk prints this object by that name.
table <- as.data.frame(table(data$study_location))
table_gpa <- table
table_exam <- table

```

```{r}
# Bar chart of how many respondents study at each location
# (Var1 is the location, Freq is the count).
ggplot(table_gpa, aes(x = Var1, y = Freq)) +
  geom_col(fill = "skyblue") +
  labs(title = "Bar Plot Example", x = "Category", y = "Value")
```




```{r}
# Average GPA per study location, computed for every location in one grouped
# pass. The earlier per-location copies built the "Outside" subset from the
# cafe rows, so the "Outside" average was a duplicate of the cafe average.
avg_gpa_by_location <- data %>%
  group_by(study_location) %>%
  summarise(avg_gpa = mean(gpa, na.rm = TRUE)) %>%
  ungroup()

# Attach each average to its row of table_gpa by location name rather than by
# position, so the result does not depend on the row order of the table.
table_gpa$avg_gpa <- avg_gpa_by_location$avg_gpa[
  match(table_gpa$Var1, avg_gpa_by_location$study_location)
]

# Combine the frequency of each study location (translucent bars) with the
# average GPA at that location (red points).
# NOTE(review): both layers share one y axis, and the title and axis labels
# are generic placeholders -- consider more descriptive text.
ggplot(table_gpa, aes(x = Var1)) +
  geom_point(aes(y = avg_gpa), color = "red", size = 3) +
  geom_bar(aes(y = Freq), stat = "identity", fill = "skyblue", alpha = 0.5) +
  labs(title = "Scatter Plot and Bar Graph",
       x = "Category",
       y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```
https://www.dataquest.io/blog/add-column-to-dataframe-in-r/ 


```{r}
# Average study_hrs_exam per study location, computed for every location in
# one grouped pass. The earlier per-location copies built the "Outside" subset
# from the cafe rows, so the "Outside" average duplicated the cafe average.
avg_study_by_location <- data %>%
  group_by(study_location) %>%
  summarise(avg_time_study = mean(study_hrs_exam, na.rm = TRUE)) %>%
  ungroup()

# Print the frequency data frame for reference.
table

# Attach each average to its row of table_exam by location name rather than by
# position, so the result does not depend on the row order of the table.
table_exam$avg_time_study <- avg_study_by_location$avg_time_study[
  match(table_exam$Var1, avg_study_by_location$study_location)
]

# Combine the frequency of each study location (translucent bars) with the
# average study_hrs_exam at that location (red points).
# NOTE(review): both layers share one y axis, and the title and axis labels
# are generic placeholders -- consider more descriptive text.
ggplot(table_exam, aes(x = Var1)) +
  geom_point(aes(y = avg_time_study), color = "red", size = 3) +
  geom_bar(aes(y = Freq), stat = "identity", fill = "skyblue", alpha = 0.5) +
  labs(title = "Scatter Plot and Bar Graph",
       x = "Category",
       y = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
```


