#installing readxl package because i will be using an excel file and
#then i have to bring up the library for the package i installed
#install.packages("knitr")
library(knitr)
#install.packages("readxl")
library(readxl)
#Reading and importing data from XL read_excel ("2014-2022 ONLY.xlsx")
#set working directory
#bring up libraries for packages that i will be using
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#install.packages("reader")
library(reader)
## Loading required package: NCmisc
##
## Attaching package: 'reader'
##
## The following objects are masked from 'package:NCmisc':
##
## cat.path, get.ext, rmv.ext
# Import the spreadsheet and keep it in the environment as X2014_2022_ONLY
library(readxl)
X2014_2022_ONLY <- readxl::read_excel(
  path = "/Users/misschelsita/Documents/Fall 2022/PSY 211/RStuff/2014-2022 ONLY.xlsx"
)
## New names:
## • `` -> `...21`
## • `` -> `...22`
## • `` -> `...23`
## • `` -> `...24`
#View(X2014_2022_ONLY)
# Trim the dataset down to the four columns of interest: GENDER, RACE, GRADE
# and ROSPST (the post self esteem scores), then print a summary of each one
WSself <- dplyr::select(
  X2014_2022_ONLY,
  dplyr::all_of(c("GENDER", "RACE", "GRADE", "ROSPST"))
)
summary(WSself)
## GENDER RACE GRADE ROSPST
## Min. :1.000 Min. :1.000 Min. :1.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:17.00
## Median :1.000 Median :2.000 Median :3.000 Median :21.00
## Mean :1.494 Mean :1.669 Mean :2.747 Mean :22.47
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:26.00
## Max. :3.000 Max. :2.000 Max. :5.000 Max. :99.00
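## The mean for post self esteem was 22.47 across all demographic variables; this figure still includes the 99 codes entered for missing answers (the mean of the cleaned scores further down is 20.87)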
# Lowest and highest post self esteem score in the untouched data
library(stats)
with(WSself, range(ROSPST))
## [1] 3 99
# 99 was entered wherever an answer is missing, so those values have to be
# turned into NA before the range of the scores means anything
library(dplyr)
# Recode the number 99 one column at a time. Passing the whole data frame
# together with the string "99" to na_if() only works on older dplyr releases;
# newer ones require the replacement value to match the type of the data.
WSselfGrades <- mutate(WSself, across(everything(), function(x) na_if(x, 99)))
# Remove every row that still contains missing data
WSselfGrades <- na.omit(WSselfGrades)
# Range and summary of the clean data
# Lowest and highest score once the 99 codes are gone
range(WSselfGrades[["ROSPST"]])
## [1] 3 30
# Five-number summary plus the mean of the cleaned scores
summary(WSselfGrades[["ROSPST"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 17.00 21.00 20.87 25.00 30.00
# How many students obtained each score
table(WSselfGrades[["ROSPST"]])
##
## 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 3 3 3 4 6 6 11 7 15 22 18 25 53 42 44 72 53 65 53 54 43 69 49 44 58 32
## 29 30
## 32 70
# Selecting only the two variables needed for the plots
# Keep just the grade and the post self esteem score
WSselfGs <- WSselfGrades %>%
  dplyr::select(GRADE, ROSPST)
# Changing variable names: the self esteem scores and the student grades
# Give both columns descriptive names (new name on the left, old on the right)
WSselfGs <- dplyr::rename(
  WSselfGs,
  RosenbergSelfEsteemPOST = ROSPST,
  StudentGrades = GRADE
)
# Tell R which variable is categorical (a factor) and which one is a number
WSselfGs$StudentGrades <- as.factor(WSselfGs$StudentGrades)
# The converted scores go back into WSselfGs itself; writing them to
# WSselfGrades would leave this column unconverted and add a stray column to
# the other data frame
WSselfGs$RosenbergSelfEsteemPOST <- as.numeric(WSselfGs$RosenbergSelfEsteemPOST)
# Histogram of the post self esteem scores.
# The title is passed through `main`; hist() has no `Data` argument, which is
# what produced the '"Data" is not a graphical parameter' warnings.
hist(
  WSselfGs$RosenbergSelfEsteemPOST,
  main = "Histogram for Post Self Esteem Scores", # Title
  xlab = "Post Self Esteem Scores", # X-axis name: the variable being binned
  ylab = "Frequency", # Y-axis name: bar height is a count of students
  border = "black", # Bar border color
  col = "blue", # Bar color
  xlim = c(0, 30) # X-axis limits: the cleaned scores run from 3 to 30
)
# No ylim is given so that R sizes the y-axis to the tallest bar
# Replace the factor's level labels with the school grade each one stands for.
# The labels are matched to the existing levels by position.
levels(WSselfGs[["StudentGrades"]]) <- c(
  "4th", "5th", "6th", "7th", "8th"
)
# Boxplot for both variables of interest: one box of post self esteem scores
# per grade level. The columns are named bare in the formula because `data`
# already says where to find them; explicit labels keep the axes readable.
boxplot(
  RosenbergSelfEsteemPOST ~ StudentGrades,
  data = WSselfGs,
  xlab = "Student Grades",
  ylab = "Rosenberg Self Esteem (Post)"
)