#importing libraries and data

#about Dataset 
# This is a survey of 55 Duke University students who were asked about their GPA, the number of hours they study per week, the number of hours they sleep per night, the number of nights they go out, and their gender.

# Directory that holds gpa.csv on the original author's machine.
data_dir <- "C:/Users/khatrna1/Desktop/EAccess"
# Only switch directories when that path actually exists. On any other machine
# an unconditional setwd() would abort the script; there the data file is
# instead looked up in the current working directory.
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(formattable)
# Load the survey responses. The path is relative, so the file is looked up in
# the current working directory.
duke_gpa <- read_csv("gpa.csv")
## Rows: 55 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (4): gpa, studyweek, sleepnight, out
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fail fast with a clear error if the file does not contain the expected
# columns, rather than failing later with a less obvious message.
expected_cols <- c("gpa", "studyweek", "sleepnight", "out", "gender")
stopifnot(all(expected_cols %in% names(duke_gpa)))
# Quick structural checks: number of rows/columns and the column names.
dim(duke_gpa)
## [1] 55  5
names(duke_gpa)
## [1] "gpa"        "studyweek"  "sleepnight" "out"        "gender"
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the script runs non-interactively
# (e.g. via Rscript or while knitting).
if (interactive()) {
  View(duke_gpa)
}
# Correlation heatmap of the numeric survey variables ----
# Keep only the numeric columns; `gender` is character and cannot be passed
# to cor().
subset_relation <- duke_gpa[c("gpa", "studyweek", "out", "sleepnight")]
# Pairwise correlations (cor() defaults to Pearson). Rows containing NA are
# dropped so that one missing answer does not turn entries of the matrix
# into NA. The variable name is kept as-is (including its spelling) in case
# other code refers to it.
corrrelation_factors <- cor(subset_relation, use = "complete.obs")

# By default heatmap() standardises each row (scale = "row") and lets the
# colour range follow the data, so cells would be coloured by row-wise
# z-scores instead of by the correlations themselves. Plot the raw values,
# treat the matrix as symmetric, and pin the colour range to [-1, 1] so that
# white sits at zero correlation.
heatmap(corrrelation_factors,
        symm = TRUE,
        scale = "none",
        zlim = c(-1, 1),
        col = colorRampPalette(c("purple", "white", "orange"))(100),
        main = "Correlation Heatmap")

# Reading the plot: stronger positive correlations are shown in orange,
# stronger negative correlations in purple, and no correlation in white.
# Patterns among the students with the highest GPA ----
# Sort the survey from highest to lowest GPA. Ties keep their original
# relative order.
duke_gpa_sorted <- arrange(duke_gpa, desc(gpa))
print(duke_gpa_sorted)
## # A tibble: 55 × 5
##      gpa studyweek sleepnight   out gender
##    <dbl>     <dbl>      <dbl> <dbl> <chr> 
##  1  4.67        14        6.5   3   male  
##  2  4           25        7     3   female
##  3  4           40        8     3   female
##  4  4           42        5     1   female
##  5  4           20        7     3   female
##  6  3.98         4        7     1.5 female
##  7  3.92        10        8     3   female
##  8  3.92        30        7     2   female
##  9  3.92        10        8     3   female
## 10  3.9         15        6     1   female
## # ℹ 45 more rows
# The ten highest-GPA students are the first ten rows of the sorted table.
top_ten <- head(duke_gpa_sorted, n = 10)
print(top_ten)
## # A tibble: 10 × 5
##      gpa studyweek sleepnight   out gender
##    <dbl>     <dbl>      <dbl> <dbl> <chr> 
##  1  4.67        14        6.5   3   male  
##  2  4           25        7     3   female
##  3  4           40        8     3   female
##  4  4           42        5     1   female
##  5  4           20        7     3   female
##  6  3.98         4        7     1.5 female
##  7  3.92        10        8     3   female
##  8  3.92        30        7     2   female
##  9  3.92        10        8     3   female
## 10  3.9         15        6     1   female
# Average of the `studyweek` column for the top ten.
print(paste(
  "Average study hours per week of top students =",
  mean(top_ten$studyweek)
))
## [1] "Average study hours per week of top students = 21"
# Average of the `sleepnight` column for the top ten. This is a sleep figure,
# not a study figure, so it is labelled accordingly.
print(paste(
  "Average sleep hours per night of top students =",
  mean(top_ten$sleepnight)
))
## [1] "Average sleep hours per night of top students = 6.95"