R - Assignment #2

7. BONUS – place the original .csv in a GitHub repository and have R read it from the link. This will be a very useful skill as you progress in your data science education and career.

# Load the students data set directly from the raw GitHub URL.
# read.csv() is used instead of read.table(sep = ",") because its defaults
# are meant for CSV: only the double quote is a quoting character and
# comment handling is off, so apostrophes or '#' inside a field are read
# as data.
data_frame <- read.csv(
  file = "https://raw.githubusercontent.com/cliftonleesps/r_bridge_wk2_csv/main/students.csv",
  header = TRUE
)

1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

# Per-column overview of the full data set.
summary(object = data_frame)

##        X         treatment              low            high      
##  Min.   : 1.0   Length:35          Min.   : 6.0   Min.   : 0.00  
##  1st Qu.: 9.5   Class :character   1st Qu.:13.0   1st Qu.:13.50  
##  Median :18.0   Mode  :character   Median :34.0   Median :28.00  
##  Mean   :18.0                      Mean   :33.8   Mean   :28.06  
##  3rd Qu.:26.5                      3rd Qu.:51.5   3rd Qu.:40.00  
##  Max.   :35.0                      Max.   :66.0   Max.   :68.00

# Mean of the `high` column, formatted to two decimal places for display.
mean_data_frame <- sprintf(fmt = "%2.2f", mean(data_frame[["high"]]))
cat("mean(data_frame$high) = ", mean_data_frame, "\n", sep = " ")

## mean(data_frame$high) =  28.06

# Median of the `low` column; kept numeric and formatted by cat() on output.
median_data_frame <- median(x = data_frame[["low"]])
cat("median(data_frame$low) = ", median_data_frame, "\n", sep = " ")

## median(data_frame$low) =  34

2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# Keep only the "AA" treatment rows whose `low` value is at least 10.
# which() drops rows where the condition is NA, as subset() does.
subset_data_frame <- data_frame[
  which(data_frame$treatment == "AA" & data_frame$low >= 10), ,
  drop = FALSE
]

3. Create new column names for the new data frame.

# Rename the columns with plyr: each vector name is an existing column name
# and each value is its replacement.
library(plyr)
subset_data_frame <- plyr::rename(
  subset_data_frame,
  replace = c(
    X = "Student",
    treatment = "Program",
    low = "MinSession",
    high = "MaxSession"
  )
)

4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Per-column overview of the renamed subset, for comparison with the full set.
summary(object = subset_data_frame)

##     Student         Program            MinSession      MaxSession   
##  Min.   : 2.000   Length:9           Min.   :11.00   Min.   :20.00  
##  1st Qu.: 5.000   Class :character   1st Qu.:12.00   1st Qu.:23.00  
##  Median : 7.000   Mode  :character   Median :14.00   Median :28.00  
##  Mean   : 7.889                      Mean   :15.67   Mean   :26.56  
##  3rd Qu.:12.000                      3rd Qu.:18.00   3rd Qu.:30.00  
##  Max.   :14.000                      Max.   :29.00   Max.   :32.00

# Mean of `MaxSession` in the subset, formatted to two decimal places.
mean_subset_data_frame <- sprintf(
  fmt = "%2.2f",
  mean(subset_data_frame[["MaxSession"]])
)
cat(
  "mean(subset_data_frame$MaxSession) = ", mean_subset_data_frame, "\n",
  sep = " "
)

## mean(subset_data_frame$MaxSession) =  26.56

# Compare the two means numerically. mean_subset_data_frame and
# mean_data_frame hold sprintf()-formatted strings, and `<=` on strings
# compares text rather than magnitude (e.g. "9.00" <= "10.00" is FALSE),
# so the numeric means are recomputed here for the comparison.
if (mean(subset_data_frame$MaxSession) <= mean(data_frame$high)) {
  print("The subset's mean is less than or equal to the original data_frame")
} else {
  print("The subset's mean is higher than the original data_frame")
}

## [1] "The subset's mean is less than or equal to the original data_frame"

# Median of `MinSession` in the subset, formatted to two decimal places.
# median() is used so the value matches its label and variable name;
# calling mean() here would print the mean under the "median" label.
median_subset_data_frame <- sprintf(
  "%2.2f",
  median(subset_data_frame$MinSession)
)
cat("median(subset_data_frame$MinSession) = ", median_subset_data_frame, "\n")

## median(subset_data_frame$MinSession) =  14.00

# Compare the two medians numerically. median_subset_data_frame is a
# sprintf()-formatted string; comparing it with a number makes R coerce the
# number to a string and compare text, not magnitude. Recompute both
# medians as numbers for the comparison instead.
if (median(subset_data_frame$MinSession) <= median(data_frame$low)) {
  print("The subset's median is less than or equal to the original data_frame")
} else {
  print("The subset's median is higher than the original data_frame")
}

## [1] "The subset's median is less than or equal to the original data_frame"

5. For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

# Recode values with stringr. library() is used instead of require() so a
# missing package stops the script with an error rather than returning FALSE.
library(stringr)

# Relabel the treatment code.
subset_data_frame$Program <- str_replace(
  string = subset_data_frame$Program,
  pattern = "AA",
  replacement = "Group A"
)

# Replace selected student IDs with names. The patterns are anchored with
# ^ and $ so that, for example, "2" does not also match inside "12".
# Student holds numeric IDs at this point, so it is converted to character
# explicitly before the text replacements.
subset_data_frame$Student <- as.character(subset_data_frame$Student)
subset_data_frame$Student <- str_replace(
  string = subset_data_frame$Student,
  pattern = "^8$",
  replacement = "Bob"
)
subset_data_frame$Student <- str_replace(
  string = subset_data_frame$Student,
  pattern = "^2$",
  replacement = "Gail"
)
subset_data_frame$Student <- str_replace(
  string = subset_data_frame$Student,
  pattern = "^12$",
  replacement = "Harold"
)

6. Display enough rows to see examples of all of steps 1-5 above.

# Show the whole subset so the renamed columns and recoded values are visible.
print(subset_data_frame)

##    Student Program MinSession MaxSession
## 2     Gail Group A         18         28
## 4        4 Group A         12         20
## 5        5 Group A         15         30
## 6        6 Group A         12         32
## 7        7 Group A         18         31
## 8      Bob Group A         29         25
## 12  Harold Group A         14         30
## 13      13 Group A         11         23
## 14      14 Group A         12         20