library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Start from a clean workspace: remove the objects that ls() lists.
rm(list = ls())
# Any line that begins with '#', like this one, is a comment; R will not run it as a command.
# The path below is a folder on the author's computer.
# Change it to the working folder on your own computer.
setwd("/Users/luanpao/desktop/SWRK615")
# You can run the command on a line by highlighting the line with your mouse and clicking the 'Run' icon at the top right of this window.
#Please try running the following commands. You don't need to change anything for the moment.
Suppose that you are grading four students from two classes.
# Scores of the four students on the first exam
exam01 <- c(90, 80, 60, 70)
exam01 # typing a variable's name prints its values
## [1] 90 80 60 70
# LRP scores of the same four students
LRP <- c(50, 60, 100, 20)
LRP
## [1] 50 60 100 20
# Class of each student: the first two are in class 1, the last two in class 2
class <- rep(c(1, 2), each = 2)
class
## [1] 1 1 2 2
Create a data frame that contains exam01, LRP, and class
# Combine the three vectors into one data frame, one column per vector
grade615 <- data.frame(exam01 = exam01, LRP = LRP, class = class)
grade615 # print the data frame
## exam01 LRP class
## 1 90 50 1
## 2 80 60 1
## 3 60 100 2
## 4 70 20 2
How many students are in class 1 and in class 2?
table(grade615$class)
##
## 1 2
## 2 2
Mean of each assignment
mean(grade615$exam01)
## [1] 75
mean(grade615$LRP)
## [1] 57.5
# install.packages("readxl") # run this once if the readxl package is not installed in your R
library(readxl)
# Read the Excel file from the working folder; run this line the same way you ran the lines above.
# df is just a name I made up for the dataset. Try giving it a cool name.
df <- read_excel(path = "excel_exam.xlsx")
# If the data loaded correctly, "df" (or the name you chose) appears in the window on the right.
# Double-click it there to view your dataset.
Loading datafile (Excel, CSV, Rdata)
# df <- read_excel("file_name.xlsx", col_names = FALSE) # When the first row is not variable names
# df <- read_excel("file_name.xlsx", sheet = 3) # When you have multiple sheets in the spreadsheet
# df <- read.csv("file_name.csv", header = TRUE) # CSV file
# df <- read.csv("file_name.csv", stringsAsFactors = FALSE) # When the dataset contains text columns
#load("file_name.rda")
Take a look at df dataset
df
## # A tibble: 30 x 5
## id class exam rp1 rp2
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 50 98 50
## 2 2 1 60 97 60
## 3 3 1 45 86 78
## 4 4 1 30 98 58
## 5 5 1 25 80 65
## 6 6 1 50 89 98
## 7 7 1 80 90 45
## 8 8 1 90 78 25
## 9 9 1 20 98 15
## 10 10 1 50 98 45
## # ... with 20 more rows
head(df, 30) # to see the first 30 rows of df
## # A tibble: 30 x 5
## id class exam rp1 rp2
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 50 98 50
## 2 2 1 60 97 60
## 3 3 1 45 86 78
## 4 4 1 30 98 58
## 5 5 1 25 80 65
## 6 6 1 50 89 98
## 7 7 1 80 90 45
## 8 8 1 90 78 25
## 9 9 1 20 98 15
## 10 10 1 50 98 45
## # ... with 20 more rows
tail(df, 10) # to see the last 10 rows of df
## # A tibble: 10 x 5
## id class exam rp1 rp2
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 2 75 85 90
## 2 22 2 85 94 88
## 3 23 2 98 93 94
## 4 24 2 92 72 60
## 5 25 2 94 80 90
## 6 26 2 80 94 78
## 7 27 2 95 96 95
## 8 28 2 86 95 90
## 9 29 2 93 85 95
## 10 30 2 91 88 95
#View(df) # view your data frame (you can also click the name of the dataframe on the right top window)
dim(df) # the number of rows (observations) and columns (variables)
## [1] 30 5
str(df) # summary of structure
## Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 5 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ class: num 1 1 1 1 1 1 1 1 1 1 ...
## $ exam : num 50 60 45 30 25 50 80 90 20 50 ...
## $ rp1 : num 98 97 86 98 80 89 90 78 98 98 ...
## $ rp2 : num 50 60 78 58 65 98 45 25 15 45 ...
Try some exercises
# What is the mean of exam?
mean(df$exam)
## [1] 70.06667
# What is the mean of research paper 1(rp1)?
# What is the mean of research paper 2 (rp2)?
summary(df) # to see descriptive stats for each variable in the dataset
## id class exam rp1
## Min. : 1.00 Min. :1.000 Min. : 20.00 Min. :56.00
## 1st Qu.: 8.25 1st Qu.:1.000 1st Qu.: 50.00 1st Qu.:81.25
## Median :15.50 Median :1.000 Median : 77.50 Median :89.50
## Mean :15.50 Mean :1.467 Mean : 70.07 Mean :87.43
## 3rd Qu.:22.75 3rd Qu.:2.000 3rd Qu.: 91.75 3rd Qu.:95.75
## Max. :30.00 Max. :2.000 Max. :100.00 Max. :98.00
## rp2
## Min. :12.0
## 1st Qu.:58.5
## Median :78.0
## Mean :69.0
## 3rd Qu.:90.0
## Max. :98.0
# Final grade: the unweighted average of the exam and the two research papers
df[["final"]] <- (df[["exam"]] + df[["rp1"]] + df[["rp2"]]) / 3
df # can you see the new final column in your dataset?
## # A tibble: 30 x 6
## id class exam rp1 rp2 final
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 50 98 50 66.00000
## 2 2 1 60 97 60 72.33333
## 3 3 1 45 86 78 69.66667
## 4 4 1 30 98 58 62.00000
## 5 5 1 25 80 65 56.66667
## 6 6 1 50 89 98 79.00000
## 7 7 1 80 90 45 71.66667
## 8 8 1 90 78 25 64.33333
## 9 9 1 20 98 15 44.33333
## 10 10 1 50 98 45 64.33333
## # ... with 20 more rows
summary(df$final)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 44.33 65.25 74.17 75.50 87.75 95.33
hist(df$final) # see the distribution of final grade
# Convert the numeric final score into a letter grade:
#   90 and above -> "A", 80 up to 90 -> "B", 70 up to 80 -> "c", below 70 -> "fail"
# findInterval() counts how many of the cutoffs each score has reached (0 to 3);
# adding 1 turns that count into a position in the label vector.
# NOTE(review): "c" is lowercase while "A" and "B" are uppercase; kept as is so the
# results match the recorded output.
df$letter <- c("fail", "c", "B", "A")[findInterval(df$final, c(70, 80, 90)) + 1]
table(df$letter) # number of students per letter grade
##
## A B c fail
## 6 7 5 12
library(ggplot2)
# Bar chart of how many students received each letter grade.
# ggplot() + geom_bar() is used instead of qplot(), which is deprecated as of ggplot2 3.4.0.
ggplot(df, aes(x = letter)) +
  geom_bar()
df
## # A tibble: 30 x 7
## id class exam rp1 rp2 final letter
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 1 50 98 50 66.00000 fail
## 2 2 1 60 97 60 72.33333 c
## 3 3 1 45 86 78 69.66667 fail
## 4 4 1 30 98 58 62.00000 fail
## 5 5 1 25 80 65 56.66667 fail
## 6 6 1 50 89 98 79.00000 c
## 7 7 1 80 90 45 71.66667 c
## 8 8 1 90 78 25 64.33333 fail
## 9 9 1 20 98 15 44.33333 fail
## 10 10 1 50 98 45 64.33333 fail
## # ... with 20 more rows
df %>% filter(class == 1)
## # A tibble: 16 x 7
## id class exam rp1 rp2 final letter
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 1 50 98 50 66.00000 fail
## 2 2 1 60 97 60 72.33333 c
## 3 3 1 45 86 78 69.66667 fail
## 4 4 1 30 98 58 62.00000 fail
## 5 5 1 25 80 65 56.66667 fail
## 6 6 1 50 89 98 79.00000 c
## 7 7 1 80 90 45 71.66667 c
## 8 8 1 90 78 25 64.33333 fail
## 9 9 1 20 98 15 44.33333 fail
## 10 10 1 50 98 45 64.33333 fail
## 11 11 1 65 65 65 65.00000 fail
## 12 12 1 45 85 32 54.00000 fail
## 13 13 1 46 98 65 69.66667 fail
## 14 14 1 48 87 12 49.00000 fail
## 15 15 1 75 56 78 69.66667 fail
## 16 16 1 58 98 65 73.66667 c
df %>% filter(class != 1) # Cases NOT class 1
## # A tibble: 14 x 7
## id class exam rp1 rp2 final letter
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 17 2 85 90 81 85.33333 B
## 2 18 2 95 80 82 85.66667 B
## 3 19 2 96 90 95 93.66667 A
## 4 20 2 100 80 81 87.00000 B
## 5 21 2 75 85 90 83.33333 B
## 6 22 2 85 94 88 89.00000 B
## 7 23 2 98 93 94 95.00000 A
## 8 24 2 92 72 60 74.66667 c
## 9 25 2 94 80 90 88.00000 B
## 10 26 2 80 94 78 84.00000 B
## 11 27 2 95 96 95 95.33333 A
## 12 28 2 86 95 90 90.33333 A
## 13 29 2 93 85 95 91.00000 A
## 14 30 2 91 88 95 91.33333 A
df %>% filter(class == 1 & letter == "fail")
## # A tibble: 12 x 7
## id class exam rp1 rp2 final letter
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 1 50 98 50 66.00000 fail
## 2 3 1 45 86 78 69.66667 fail
## 3 4 1 30 98 58 62.00000 fail
## 4 5 1 25 80 65 56.66667 fail
## 5 8 1 90 78 25 64.33333 fail
## 6 9 1 20 98 15 44.33333 fail
## 7 10 1 50 98 45 64.33333 fail
## 8 11 1 65 65 65 65.00000 fail
## 9 12 1 45 85 32 54.00000 fail
## 10 13 1 46 98 65 69.66667 fail
## 11 14 1 48 87 12 49.00000 fail
## 12 15 1 75 56 78 69.66667 fail
df %>% filter(class == 1 & final >= 70)
## # A tibble: 4 x 7
## id class exam rp1 rp2 final letter
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 1 60 97 60 72.33333 c
## 2 6 1 50 89 98 79.00000 c
## 3 7 1 80 90 45 71.66667 c
## 4 16 1 58 98 65 73.66667 c
# Split the data by class: df1 holds class 1, df2 holds every row that is not class 1
df1 <- filter(df, class == 1)
df2 <- filter(df, class != 1)
mean(df$final)
## [1] 75.5
mean(df1$final)
## [1] 64.45833
mean(df2$final)
## [1] 88.11905
# mean() needs a numeric or logical vector; given the whole data frame it warns and returns NA
mean(df)
## Warning in mean.default(df): argument is not numeric or logical: returning
## NA
## [1] NA
table(df$class)
##
## 1 2
## 16 14
# Two-sample t-test: does the mean final grade differ between the two classes?
# var.equal = TRUE treats the two group variances as equal.
# TRUE is spelled out because the shorthand T is an ordinary variable that can be overwritten.
t.test(final ~ class, data = df, var.equal = TRUE)
##
## Two Sample t-test
##
## data: final by class
## t = -8.2858, df = 28, p-value = 5.13e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -29.51006 -17.81137
## sample estimates:
## mean in group 1 mean in group 2
## 64.45833 88.11905
cor.test(df$exam, df$final) # exam1 and final grade
##
## Pearson's product-moment correlation
##
## data: df$exam and df$final
## t = 7.9642, df = 28, p-value = 1.129e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6752791 0.9177820
## sample estimates:
## cor
## 0.8329149
cor.test(df$rp1, df$final) # first assignment and final grade
##
## Pearson's product-moment correlation
##
## data: df$rp1 and df$final
## t = 0.30882, df = 28, p-value = 0.7597
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3084824 0.4099267
## sample estimates:
## cor
## 0.05826182
cor.test(df$rp2, df$final) # second assignment and final grade
##
## Pearson's product-moment correlation
##
## data: df$rp2 and df$final
## t = 10.165, df = 28, p-value = 6.704e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7741112 0.9452194
## sample estimates:
## cor
## 0.887005
# Save the data as a CSV file in the working folder.
# row.names = FALSE leaves out R's automatic row numbers, which would otherwise be written
# as an extra unnamed first column; the id column already identifies each row.
write.csv(df, file = "finalgrade615.csv", row.names = FALSE)
# Or save it as an RData file, which can be read back with load()
save(df, file = "finalgrade615.rda")