#installing readxl package because i will be using an excel file and
#then i have to bring up the library for the package i installed

#install.packages("knitr")
library(knitr)
#install.packages("readxl") 
library(readxl)

# Reading and importing data from Excel with read_excel("2014-2022 ONLY.xlsx")

#set working directory

#bring up libraries for packages that i will be using 

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
#install.packages("reader")
library(reader)
## Loading required package: NCmisc
## 
## Attaching package: 'reader'
## 
## The following objects are masked from 'package:NCmisc':
## 
##     cat.path, get.ext, rmv.ext

#Changing dataset name and adding it to my environment

library(readxl)
# Read the Excel workbook and keep it in the environment as X2014_2022_ONLY.
# NOTE(review): the path is absolute and specific to one machine.
X2014_2022_ONLY <- readxl::read_excel(
  path = "/Users/misschelsita/Documents/Fall 2022/PSY 211/RStuff/2014-2022 ONLY.xlsx"
)
## New names:
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
#View(X2014_2022_ONLY)

# I will create a vector of the columns I am interested in so I can trim down
# the dataset. The variables I chose to isolate are gender, race, grade, and
# post self-esteem scores. Now I select the specific columns I will be
# working with.

# Keep only the four columns used in this analysis, then print a
# per-column summary of the trimmed data.
WSself <- dplyr::select(
  X2014_2022_ONLY,
  dplyr::all_of(c("GENDER", "RACE", "GRADE", "ROSPST"))
)
summary(WSself)
##      GENDER           RACE           GRADE           ROSPST     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:17.00  
##  Median :1.000   Median :2.000   Median :3.000   Median :21.00  
##  Mean   :1.494   Mean   :1.669   Mean   :2.747   Mean   :22.47  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:26.00  
##  Max.   :3.000   Max.   :2.000   Max.   :5.000   Max.   :99.00

# The mean for post self-esteem was 22.47 across all demographic variables
# (this figure still includes the 99 placeholder values).

library(stats)
# Smallest and largest post self-esteem score before any cleaning.
range(WSself[["ROSPST"]])
## [1]  3 99

# Given that 99s have been entered for NAs, I have to remove them before
# running the range.

library(dplyr)
# Recode the 99 placeholder as NA in every selected column.
# na_if() works on one vector at a time and needs a replacement value of the
# same type as the column, so it is applied per column with the number 99
# rather than to the whole data frame with the string "99".
WSselfGrades <- dplyr::mutate(
  WSself,
  dplyr::across(dplyr::everything(), function(x) dplyr::na_if(x, 99))
)

# Removing missing data: drop every row that has an NA in any column.
WSselfGrades <- na.omit(WSselfGrades)

# Range and summary of the clean data

# Minimum and maximum of the cleaned post self-esteem scores.
range(WSselfGrades[["ROSPST"]])
## [1]  3 30
# Quartiles, mean, and extremes of the cleaned scores.
summary(WSselfGrades[["ROSPST"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   17.00   21.00   20.87   25.00   30.00
# Frequency of each distinct score.
table(WSselfGrades[["ROSPST"]])
## 
##  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 
##  3  3  3  4  6  6 11  7 15 22 18 25 53 42 44 72 53 65 53 54 43 69 49 44 58 32 
## 29 30 
## 32 70

# Selecting only the two variables needed for the plots

# Keep just the grade and post self-esteem columns for plotting and give
# them descriptive names in the same step (GRADE -> StudentGrades,
# ROSPST -> RosenbergSelfEsteemPOST).
WSselfGs <- dplyr::select(
  WSselfGrades,
  StudentGrades = GRADE,
  RosenbergSelfEsteemPOST = ROSPST
)

# Telling R what's a number versus a factor

# Grade is treated as a category (factor); the self-esteem score as a number.
# Both conversions are written back to WSselfGs, the data frame the plots
# read from, so no stray column is added to WSselfGrades.
WSselfGs$StudentGrades <- as.factor(WSselfGs$StudentGrades)
WSselfGs$RosenbergSelfEsteemPOST <- as.numeric(WSselfGs$RosenbergSelfEsteemPOST)
# Histogram of the cleaned post self-esteem scores.
# The title goes in `main`; `Data` is not an argument of hist() and only
# produced "not a graphical parameter" warnings. The x-axis shows scores
# and the y-axis shows how many rows fall in each bin.
hist(
  WSselfGs$RosenbergSelfEsteemPOST,
  main = "Histogram for Post Self Esteem Scores", # Title
  xlab = "Post Self Esteem Score",                # X-axis name
  ylab = "Frequency",                             # Y-axis name
  border = "black",                               # Bar border color
  col = "blue",                                   # Bar color
  xlim = c(0, 30)                                 # Cleaned scores run 3 to 30
)
# Y-axis limits are left to hist() so that no bar is cut off.

# Relabel the grade factor's levels, in their existing order, with
# readable grade names.
levels(WSselfGs[["StudentGrades"]]) <- c(
  "4th", "5th", "6th", "7th", "8th"
)

# Boxplot for both variables of interest: post self-esteem score by grade.
# The formula uses bare column names so that `data` supplies the columns,
# and explicit axis labels replace the default "WSselfGs$..." text.
boxplot(
  RosenbergSelfEsteemPOST ~ StudentGrades,
  data = WSselfGs,
  xlab = "Grade Level",
  ylab = "Post Self Esteem Score"
)