One of the challenges in working with data is wrangling. In this assignment we will use R to perform this task.

Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list)

Please select one, download it and perform the following tasks:

Load the dataset into R:

# Load the dataset
# Question 7 (bonus): the CSV is read from a copy hosted on GitHub.
# Alternative source (Rdatasets), left disabled:
# "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/mediation/student.csv"
theURL <- "https://raw.githubusercontent.com/letisalbal/R-Bridge-Homework-2/main/mediationstudent.csv"

# read.csv() already defaults to header = TRUE and sep = ",".
mediation_stud <- read.csv(theURL)

# Show the first rows to confirm the load.
print(head(mediation_stud))
##   X fight attachment work score late coed smorale gender income free pared
## 1 1     0          0    0    46    1    1       5      1     10    3     0
## 2 2     1          0    0    48    5    1       5      0      9    3     0
## 3 3     1          1    0    72    3    1       5      1     13    3     1
## 4 4     0          1    1    57    1    1       5      0     12    3     0
## 5 5     0          1    0    51    2    1       5      0     11    3     0
## 6 6     0          1    1    52    2    1       5      0      7    3     0
##   catholic SCH_ID
## 1        0      1
## 2        0      1
## 3        0      1
## 4        0      1
## 5        0      1
## 6        0      1

1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

# Print Summary
# Overview of every column of the full data set; the result is auto-printed
# when this call is evaluated at top level.
summary(mediation_stud)
##        X            fight          attachment          work       
##  Min.   :   1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2420   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :4840   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :4840   Mean   :0.1256   Mean   :0.8856   Mean   :0.3852  
##  3rd Qu.:7260   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :9679   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      score            late            coed           smorale     
##  Min.   :19.00   Min.   :1.000   Min.   :0.0000   Min.   :2.000  
##  1st Qu.:45.00   1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:4.000  
##  Median :52.00   Median :2.000   Median :1.0000   Median :4.000  
##  Mean   :51.91   Mean   :2.242   Mean   :0.9413   Mean   :4.017  
##  3rd Qu.:59.00   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:5.000  
##  Max.   :87.00   Max.   :5.000   Max.   :1.0000   Max.   :5.000  
##      gender           income            free           pared       
##  Min.   :0.0000   Min.   : 1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 8.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :1.0000   Median :10.000   Median :3.000   Median :0.0000  
##  Mean   :0.5211   Mean   : 9.258   Mean   :2.988   Mean   :0.4364  
##  3rd Qu.:1.0000   3rd Qu.:11.000   3rd Qu.:4.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :16.000   Max.   :7.000   Max.   :1.0000  
##     catholic          SCH_ID     
##  Min.   :0.0000   Min.   :  1.0  
##  1st Qu.:0.0000   1st Qu.:142.0  
##  Median :0.0000   Median :285.0  
##  Mean   :0.1426   Mean   :285.5  
##  3rd Qu.:0.0000   3rd Qu.:431.0  
##  Max.   :1.0000   Max.   :568.0
# Print mean and median of two attributes (income and score).
# vapply() is used instead of sapply() so that each statistic is guaranteed
# to be exactly one numeric value per column; the columns are selected once.
stat_cols <- mediation_stud[c("income", "score")]
means <- vapply(stat_cols, mean, numeric(1))
medians <- vapply(stat_cols, median, numeric(1))

# One row per attribute, one column per statistic.
means_medianDF <- data.frame(means, medians)
print(means_medianDF)
##            means medians
## income  9.258394      10
## score  51.911768      52

2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# Build the new data frame: the first ten rows and six selected columns.
subset_rows <- seq_len(10)
subset_cols <- c("fight", "score", "late", "coed", "gender", "income")
media_stud_subset <- mediation_stud[subset_rows, subset_cols]
media_stud_subset
##    fight score late coed gender income
## 1      0    46    1    1      1     10
## 2      1    48    5    1      0      9
## 3      1    72    3    1      1     13
## 4      0    57    1    1      0     12
## 5      0    51    2    1      0     11
## 6      0    52    2    1      0      7
## 7      0    42    3    1      0      7
## 8      0    41    2    1      0      4
## 9      0    28    2    1      1      8
## 10     0    56    1    1      0     12
# Subset the rows and columns of the data frame with bracket indexing.

# Keep only the score column, for rows where score is at least 65.
# which() discards NA comparisons, so rows with a missing score are left out.
high_score_rows <- which(mediation_stud[["score"]] >= 65)
mediation_stud_subset <- mediation_stud[high_score_rows, "score", drop = FALSE]
head(mediation_stud_subset)
##     score
## 3      72
## 58     67
## 75     70
## 83     65
## 104    66
## 114    66
tail(mediation_stud_subset)
##      score
## 9661    68
## 9665    76
## 9670    66
## 9674    65
## 9676    70
## 9678    68
# Keep only the late column, for rows where late is below 3.
# which() discards NA comparisons, so rows with a missing value are left out.
rarely_late_rows <- which(mediation_stud[["late"]] < 3)
mediation_stud_subset2 <- mediation_stud[rarely_late_rows, "late", drop = FALSE]
head(mediation_stud_subset2)
##   late
## 1    1
## 4    1
## 5    2
## 6    2
## 8    2
## 9    2
tail(mediation_stud_subset2)
##      late
## 9671    2
## 9673    2
## 9674    2
## 9677    1
## 9678    1
## 9679    1
# Keep only the income column, for rows where income is at most 9.
# which() discards NA comparisons, so rows with a missing value are left out.
low_income_rows <- which(mediation_stud[["income"]] <= 9)
mediation_stud_subset3 <- mediation_stud[low_income_rows, "income", drop = FALSE]
head(mediation_stud_subset3)
##    income
## 2       9
## 6       7
## 7       7
## 8       4
## 9       8
## 12      9
tail(mediation_stud_subset3)
##      income
## 9660      9
## 9664      9
## 9666      7
## 9668      4
## 9671      6
## 9675      8

3. Create new column names for the new data frame.

# Show the current column names of the subset.
names(media_stud_subset)
## [1] "fight"  "score"  "late"   "coed"   "gender" "income"
# Assign the new "_sub"-suffixed names, keeping the column order.
names(media_stud_subset) <- c(
  "Fight_sub", "Score_sub", "Late_sub", "CoEd_sub", "Gender_sub", "Income_sub"
)
names(media_stud_subset)
## [1] "Fight_sub"  "Score_sub"  "Late_sub"   "CoEd_sub"   "Gender_sub"
## [6] "Income_sub"

4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Print summary of data frame
# Overview of every column of the renamed subset; the result is auto-printed
# when this call is evaluated at top level.
summary(media_stud_subset)
##    Fight_sub     Score_sub       Late_sub       CoEd_sub   Gender_sub  
##  Min.   :0.0   Min.   :28.0   Min.   :1.00   Min.   :1   Min.   :0.00  
##  1st Qu.:0.0   1st Qu.:43.0   1st Qu.:1.25   1st Qu.:1   1st Qu.:0.00  
##  Median :0.0   Median :49.5   Median :2.00   Median :1   Median :0.00  
##  Mean   :0.2   Mean   :49.3   Mean   :2.20   Mean   :1   Mean   :0.30  
##  3rd Qu.:0.0   3rd Qu.:55.0   3rd Qu.:2.75   3rd Qu.:1   3rd Qu.:0.75  
##  Max.   :1.0   Max.   :72.0   Max.   :5.00   Max.   :1   Max.   :1.00  
##    Income_sub   
##  Min.   : 4.00  
##  1st Qu.: 7.25  
##  Median : 9.50  
##  Mean   : 9.30  
##  3rd Qu.:11.75  
##  Max.   :13.00
# Print mean and median of the same two attributes, now for the subset.
# vapply() is used instead of sapply() so that each statistic is guaranteed
# to be exactly one numeric value per column; the columns are selected once.
subset_stat_cols <- media_stud_subset[c("Income_sub", "Score_sub")]
means_subset <- vapply(subset_stat_cols, mean, numeric(1))
medians_subset <- vapply(subset_stat_cols, median, numeric(1))

# One row per attribute, one column per statistic.
means_medianDF_subset <- data.frame(means_subset, medians_subset)
print(means_medianDF_subset)
##            means_subset medians_subset
## Income_sub          9.3            9.5
## Score_sub          49.3           49.5
# Compare the two sets of statistics: full data minus subset.
# Both inputs list income first and score second, so the rows line up; the
# difference columns are labelled in the same step.
means_medians_compare <- setNames(
  means_medianDF - means_medianDF_subset,
  c("Means_difference", "Medians_difference")
)
means_medians_compare
##        Means_difference Medians_difference
## income      -0.04160554                0.5
## score        2.61176774                2.5

5. For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

library("stringr")

# Extract the gender codes as a character vector. as.character() is already
# vectorised, so no element-wise sapply() is needed, and the result is always
# a character vector (sapply() would return an empty list for empty input).
vectorx <- as.character(media_stud_subset[["Gender_sub"]])
# List the distinct codes present in the column.
unique(vectorx)
## [1] "1" "0"
Many of the columns in my data frame contain only two distinct values, so I decided to replace the values “0” and “1” in the Gender_sub column.
# Replace the gender codes with labels: "0" -> "Male", "1" -> "Female".
# NOTE(review): confirm this coding against the dataset documentation.
# The patterns are anchored (^...$) so that only values that are exactly "0"
# or "1" are rewritten; an unanchored "0" would also replace the first 0
# inside a longer value such as "10".
vectorx <- str_replace(vectorx, pattern = "^0$", replacement = "Male")
vectorx <- str_replace(vectorx, pattern = "^1$", replacement = "Female")
unique(vectorx)
## [1] "Female" "Male"
# Write the labelled values back into the data frame.
media_stud_subset[, "Gender_sub"] <- vectorx
# Print the new data frame
head(media_stud_subset)
##   Fight_sub Score_sub Late_sub CoEd_sub Gender_sub Income_sub
## 1         0        46        1        1     Female         10
## 2         1        48        5        1       Male          9
## 3         1        72        3        1     Female         13
## 4         0        57        1        1       Male         12
## 5         0        51        2        1       Male         11
## 6         0        52        2        1       Male          7

6. Display enough rows to see examples of all of steps 1 - 5 above.

See steps 1 - 5

7. BONUS - place the original.csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

See Part 1 for Question 7