library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Clear all objects from the current workspace so the script starts from a clean slate.
rm(list=ls())

## When you use '#' at the beginning of a line, just like this, R treats the line as a comment and does not run it as a command.

First, set up your working directory where you have your data file and working R file.

setwd("/Users/luanpao/desktop/SWRK615") # This is the directory on my computer; change it to the path of your own working folder.

# You can run the command on each line by highlighting the line with your mouse and clicking the 'Run' icon at the top right of this window.
# Please try running the following commands. You don't need to change anything for the moment.

Let’s create a dataframe

Suppose that you are grading four students from two classes.

# Exam 1 values for the four students
exam01 <- c(90, 80, 60, 70)
print(exam01) # display the values stored in exam01

## [1] 90 80 60 70

# LRP values for the same four students
LRP <- c(50, 60, 100, 20)
print(LRP) # display the values stored in LRP

## [1]  50  60 100  20

# Class membership: the first two students are in class 1, the last two in class 2
class <- rep(c(1, 2), each = 2)
print(class) # display the values stored in class

## [1] 1 1 2 2

Create a dataframe that contains exam01, LRP, and class

# Combine the three vectors column-wise into one data frame (one row per student)
grade615 <- data.frame(exam01 = exam01, LRP = LRP, class = class)
print(grade615) # display the resulting data frame

##   exam01 LRP class
## 1     90  50     1
## 2     80  60     1
## 3     60 100     2
## 4     70  20     2

How many students are in class 1 and in class 2?

table(grade615$class)

## 
## 1 2 
## 2 2

Mean of each assignment

mean(grade615$exam01)

## [1] 75

mean(grade615$LRP)

## [1] 57.5

Now load your data file into R

# install.packages("readxl") # Uncomment and run this once if the readxl package is not installed yet
library(readxl)
df <- read_excel("excel_exam.xlsx")  # Run this line the same way you ran the lines above.
# df is just a name I made up for the dataset. Try giving it a cool name.
# If the data loaded correctly, you should see "df" (or the name you gave the dataset) in the right-hand window. You can view the dataset by double-clicking "df" there.

Loading datafile (Excel, CSV, Rdata)

# df <- read_excel("file_name.xlsx", col_names = FALSE) # When the first row does not contain variable names
# df <- read_excel("file_name.xlsx", sheet = 3) # When the spreadsheet has multiple sheets (this reads the third one)
# df <- read.csv("file_name.csv", header = TRUE) # CSV file
# df <- read.csv("file_name.csv", stringsAsFactors = FALSE) # When the dataset contains text columns

#load("file_name.rda")

Take a look at df dataset

df

## # A tibble: 30 x 5
##       id class  exam   rp1   rp2
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1     1    50    98    50
##  2     2     1    60    97    60
##  3     3     1    45    86    78
##  4     4     1    30    98    58
##  5     5     1    25    80    65
##  6     6     1    50    89    98
##  7     7     1    80    90    45
##  8     8     1    90    78    25
##  9     9     1    20    98    15
## 10    10     1    50    98    45
## # ... with 20 more rows

head(df, 30) # to see the first 30 rows of df

## # A tibble: 30 x 5
##       id class  exam   rp1   rp2
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1     1    50    98    50
##  2     2     1    60    97    60
##  3     3     1    45    86    78
##  4     4     1    30    98    58
##  5     5     1    25    80    65
##  6     6     1    50    89    98
##  7     7     1    80    90    45
##  8     8     1    90    78    25
##  9     9     1    20    98    15
## 10    10     1    50    98    45
## # ... with 20 more rows

tail(df, 10) # to see the last 10 rows of df

## # A tibble: 10 x 5
##       id class  exam   rp1   rp2
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    21     2    75    85    90
##  2    22     2    85    94    88
##  3    23     2    98    93    94
##  4    24     2    92    72    60
##  5    25     2    94    80    90
##  6    26     2    80    94    78
##  7    27     2    95    96    95
##  8    28     2    86    95    90
##  9    29     2    93    85    95
## 10    30     2    91    88    95

#View(df) # view your data frame (you can also click the name of the dataframe on the right top window)
dim(df) # the number of rows (observations) and columns (variables)

## [1] 30  5

str(df) # summary of structure

## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  5 variables:
##  $ id   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ class: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ exam : num  50 60 45 30 25 50 80 90 20 50 ...
##  $ rp1  : num  98 97 86 98 80 89 90 78 98 98 ...
##  $ rp2  : num  50 60 78 58 65 98 45 25 15 45 ...

Try some exercises

# What is the mean of exam?
mean(df$exam)

## [1] 70.06667

# What is the mean of research paper 1(rp1)?

# What is the mean of research paper 2 (rp2)?

summary(df) # to see descriptive stats for each variable in the dataset

##        id            class            exam             rp1       
##  Min.   : 1.00   Min.   :1.000   Min.   : 20.00   Min.   :56.00  
##  1st Qu.: 8.25   1st Qu.:1.000   1st Qu.: 50.00   1st Qu.:81.25  
##  Median :15.50   Median :1.000   Median : 77.50   Median :89.50  
##  Mean   :15.50   Mean   :1.467   Mean   : 70.07   Mean   :87.43  
##  3rd Qu.:22.75   3rd Qu.:2.000   3rd Qu.: 91.75   3rd Qu.:95.75  
##  Max.   :30.00   Max.   :2.000   Max.   :100.00   Max.   :98.00  
##       rp2      
##  Min.   :12.0  
##  1st Qu.:58.5  
##  Median :78.0  
##  Mean   :69.0  
##  3rd Qu.:90.0  
##  Max.   :98.0

Let’s create a variable of final grade

df$final <- (df$exam + df$rp1 + df$rp2)/3
df # can you see the final variable in your dataset?

## # A tibble: 30 x 6
##       id class  exam   rp1   rp2    final
##    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
##  1     1     1    50    98    50 66.00000
##  2     2     1    60    97    60 72.33333
##  3     3     1    45    86    78 69.66667
##  4     4     1    30    98    58 62.00000
##  5     5     1    25    80    65 56.66667
##  6     6     1    50    89    98 79.00000
##  7     7     1    80    90    45 71.66667
##  8     8     1    90    78    25 64.33333
##  9     9     1    20    98    15 44.33333
## 10    10     1    50    98    45 64.33333
## # ... with 20 more rows

summary(df$final)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   44.33   65.25   74.17   75.50   87.75   95.33

hist(df$final) # see the distribution of final grade

# Translate each numeric final score into a letter grade.
# Cut points: below 70 -> "fail", [70, 80) -> "c", [80, 90) -> "B", 90 and above -> "A".
# findInterval() counts how many cut points are at or below each score (0 to 3),
# so adding 1 gives the position of the matching label.
df$letter <- c("fail", "c", "B", "A")[findInterval(df$final, c(70, 80, 90)) + 1L]

table(df$letter)

## 
##    A    B    c fail 
##    6    7    5   12

library(ggplot2)

# Bar chart of how many students received each letter grade.
# qplot() is deprecated as of ggplot2 3.4.0, so the plot is built with ggplot() directly.
ggplot(df, aes(x = letter)) +
  geom_bar()

When you are interested only in your class (Class 1)

df

## # A tibble: 30 x 7
##       id class  exam   rp1   rp2    final letter
##    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <chr>
##  1     1     1    50    98    50 66.00000   fail
##  2     2     1    60    97    60 72.33333      c
##  3     3     1    45    86    78 69.66667   fail
##  4     4     1    30    98    58 62.00000   fail
##  5     5     1    25    80    65 56.66667   fail
##  6     6     1    50    89    98 79.00000      c
##  7     7     1    80    90    45 71.66667      c
##  8     8     1    90    78    25 64.33333   fail
##  9     9     1    20    98    15 44.33333   fail
## 10    10     1    50    98    45 64.33333   fail
## # ... with 20 more rows

df %>% filter(class == 1)

## # A tibble: 16 x 7
##       id class  exam   rp1   rp2    final letter
##    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <chr>
##  1     1     1    50    98    50 66.00000   fail
##  2     2     1    60    97    60 72.33333      c
##  3     3     1    45    86    78 69.66667   fail
##  4     4     1    30    98    58 62.00000   fail
##  5     5     1    25    80    65 56.66667   fail
##  6     6     1    50    89    98 79.00000      c
##  7     7     1    80    90    45 71.66667      c
##  8     8     1    90    78    25 64.33333   fail
##  9     9     1    20    98    15 44.33333   fail
## 10    10     1    50    98    45 64.33333   fail
## 11    11     1    65    65    65 65.00000   fail
## 12    12     1    45    85    32 54.00000   fail
## 13    13     1    46    98    65 69.66667   fail
## 14    14     1    48    87    12 49.00000   fail
## 15    15     1    75    56    78 69.66667   fail
## 16    16     1    58    98    65 73.66667      c

df %>% filter(class != 1) # Cases NOT class 1

## # A tibble: 14 x 7
##       id class  exam   rp1   rp2    final letter
##    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <chr>
##  1    17     2    85    90    81 85.33333      B
##  2    18     2    95    80    82 85.66667      B
##  3    19     2    96    90    95 93.66667      A
##  4    20     2   100    80    81 87.00000      B
##  5    21     2    75    85    90 83.33333      B
##  6    22     2    85    94    88 89.00000      B
##  7    23     2    98    93    94 95.00000      A
##  8    24     2    92    72    60 74.66667      c
##  9    25     2    94    80    90 88.00000      B
## 10    26     2    80    94    78 84.00000      B
## 11    27     2    95    96    95 95.33333      A
## 12    28     2    86    95    90 90.33333      A
## 13    29     2    93    85    95 91.00000      A
## 14    30     2    91    88    95 91.33333      A

df %>% filter(class == 1 & letter == "fail")

## # A tibble: 12 x 7
##       id class  exam   rp1   rp2    final letter
##    <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <chr>
##  1     1     1    50    98    50 66.00000   fail
##  2     3     1    45    86    78 69.66667   fail
##  3     4     1    30    98    58 62.00000   fail
##  4     5     1    25    80    65 56.66667   fail
##  5     8     1    90    78    25 64.33333   fail
##  6     9     1    20    98    15 44.33333   fail
##  7    10     1    50    98    45 64.33333   fail
##  8    11     1    65    65    65 65.00000   fail
##  9    12     1    45    85    32 54.00000   fail
## 10    13     1    46    98    65 69.66667   fail
## 11    14     1    48    87    12 49.00000   fail
## 12    15     1    75    56    78 69.66667   fail

df %>% filter(class == 1 & final >= 70)

## # A tibble: 4 x 7
##      id class  exam   rp1   rp2    final letter
##   <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <chr>
## 1     2     1    60    97    60 72.33333      c
## 2     6     1    50    89    98 79.00000      c
## 3     7     1    80    90    45 71.66667      c
## 4    16     1    58    98    65 73.66667      c

# Create separate data frames: df1 holds class 1, df2 holds every row that is not class 1
df1 <- df %>% filter(class == 1)
df2 <- df %>% filter(class != 1)
mean(df$final) # mean final grade across all rows of df

## [1] 75.5

mean(df1$final)

## [1] 64.45833

mean(df2$final)

## [1] 88.11905

Comparing two classes: T-test

mean(df) # mean() needs a numeric or logical vector; given the whole data frame it returns NA with a warning, so pass a single column such as df$final instead

## Warning in mean.default(df): argument is not numeric or logical: returning
## NA

## [1] NA

table(df$class)

## 
##  1  2 
## 16 14

# Two-sample t-test comparing the mean final grade between the two classes.
# var.equal = TRUE treats the two group variances as equal (pooled variance).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
t.test(final ~ class, data = df, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  final by class
## t = -8.2858, df = 28, p-value = 5.13e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -29.51006 -17.81137
## sample estimates:
## mean in group 1 mean in group 2 
##        64.45833        88.11905

Correlation between variables

cor.test(df$exam, df$final) # correlation test between the exam score and the final grade

## 
##  Pearson's product-moment correlation
## 
## data:  df$exam and df$final
## t = 7.9642, df = 28, p-value = 1.129e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6752791 0.9177820
## sample estimates:
##       cor 
## 0.8329149

cor.test(df$rp1, df$final) # first assignment and final grade

## 
##  Pearson's product-moment correlation
## 
## data:  df$rp1 and df$final
## t = 0.30882, df = 28, p-value = 0.7597
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3084824  0.4099267
## sample estimates:
##        cor 
## 0.05826182

cor.test(df$rp2, df$final) # second assignment and final grade

## 
##  Pearson's product-moment correlation
## 
## data:  df$rp2 and df$final
## t = 10.165, df = 28, p-value = 6.704e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7741112 0.9452194
## sample estimates:
##      cor 
## 0.887005

# Save your data as a CSV file.
# row.names = FALSE keeps write.csv() from adding an extra unnamed column of row numbers.
write.csv(df, file = "finalgrade615.csv", row.names = FALSE)
# Or you can save it as an RData file (it can be restored later with load("finalgrade615.rda")).
save(df, file = "finalgrade615.rda")

Basic: Play with dataframe

Seongho An