# Importing libraries and data
# About the dataset:
# This is a survey of 55 Duke University students asked about their GPA, the number of hours they study per week, the number of hours they sleep per night, the number of nights they go out, and their gender.
# Folder the script runs from; relative data paths are resolved against it.
# Defaults to the original author's location. Set the GPA_DATA_DIR environment
# variable to run the script on another machine without editing this file.
data_dir <- Sys.getenv(
  "GPA_DATA_DIR",
  unset = "C:/Users/khatrna1/Desktop/EAccess"
)
# Fail early with a readable message instead of setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}
setwd(data_dir)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(formattable)
# Load the survey responses from gpa.csv (resolved against the working
# directory) into a tibble, letting readr guess the column types.
duke_gpa <- readr::read_csv(file = "gpa.csv")
## Rows: 55 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (4): gpa, studyweek, sleepnight, out
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick structural check of the loaded data: dimensions and column names.
dim(duke_gpa)
## [1] 55 5
names(duke_gpa)
## [1] "gpa" "studyweek" "sleepnight" "out" "gender"
# View() opens a GUI data viewer, which only makes sense in a live session;
# skip it when the script is sourced or knitted non-interactively.
if (interactive()) {
  View(duke_gpa)
}
# Correlation heatmap of the numeric survey variables ----
# Pairwise correlations (cor() default: Pearson) between GPA, weekly study
# hours, nights out and nightly sleep hours.
subset_relation <- duke_gpa[c("gpa", "studyweek", "out", "sleepnight")]
corrrelation_factors <- cor(subset_relation)
# heatmap() standardises each row by default (scale = "row"), which would make
# the colours reflect row-wise z-scores rather than the correlations
# themselves. Plot the raw values, treat the matrix as symmetric so rows and
# columns are ordered alike, and pin the colour range to [-1, 1] (zlim is
# forwarded to image()) so that white sits at zero correlation.
heatmap(
  corrrelation_factors,
  symm = TRUE,
  scale = "none",
  zlim = c(-1, 1),
  col = colorRampPalette(c("purple", "white", "orange"))(100),
  main = "Correlation Heatmap"
)

# Stronger positive correlations are shown in orange, stronger negative
# correlations in purple, and no correlation in white.
# Look for patterns among the highest-achieving students.
# Rank every row from highest to lowest GPA; tied rows keep their file order.
duke_gpa_sorted <- duke_gpa[order(duke_gpa$gpa, decreasing = TRUE), ]
print(duke_gpa_sorted)
## # A tibble: 55 × 5
## gpa studyweek sleepnight out gender
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 4.67 14 6.5 3 male
## 2 4 25 7 3 female
## 3 4 40 8 3 female
## 4 4 42 5 1 female
## 5 4 20 7 3 female
## 6 3.98 4 7 1.5 female
## 7 3.92 10 8 3 female
## 8 3.92 30 7 2 female
## 9 3.92 10 8 3 female
## 10 3.9 15 6 1 female
## # ℹ 45 more rows
# Keep the ten best-ranked rows (fewer if the table is shorter than ten).
top_ten <- duke_gpa_sorted[seq_len(min(10L, nrow(duke_gpa_sorted))), ]
print(top_ten)
## # A tibble: 10 × 5
## gpa studyweek sleepnight out gender
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 4.67 14 6.5 3 male
## 2 4 25 7 3 female
## 3 4 40 8 3 female
## 4 4 42 5 1 female
## 5 4 20 7 3 female
## 6 3.98 4 7 1.5 female
## 7 3.92 10 8 3 female
## 8 3.92 30 7 2 female
## 9 3.92 10 8 3 female
## 10 3.9 15 6 1 female
# Averages for the ten highest-GPA students.
# `studyweek` is weekly study time and `sleepnight` is nightly sleep, so each
# label names the quantity actually averaged. paste() already inserts a space
# between its arguments, so the labels carry no trailing space.
print(paste(
  "Average study hours per week of top students =",
  mean(top_ten$studyweek)
))
## [1] "Average study hours per week of top students = 21"
print(paste(
  "Average sleep hours per night of top students =",
  mean(top_ten$sleepnight)
))
## [1] "Average sleep hours per night of top students = 6.95"