##Data Set https://vincentarelbundock.github.io/Rdatasets/csv/AER/CPSSWEducation.csv https://vincentarelbundock.github.io/Rdatasets/doc/AER/CPSSWEducation.html

Question 1

Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes of your data.

# Option 1 (disabled): copy the data from an object named CPSSWEducation
# Data_education <- CPSSWEducation

# Option 2: read the CSV directly from its GitHub-hosted raw URL.
# The first line is treated as the header and fields are comma-separated.
education_csv_url <- "https://raw.githubusercontent.com/Benson90/BrigeR/main/CPSSWEducation.csv"
Data_education <- read.csv(education_csv_url, sep = ",", header = TRUE)

# Per-column overview of the complete data set
summary(Data_education)
##        X               age          gender             earnings     
##  Min.   :   1.0   Min.   :29.0   Length:2950        Min.   : 2.137  
##  1st Qu.: 738.2   1st Qu.:29.0   Class :character   1st Qu.:10.577  
##  Median :1475.5   Median :29.0   Mode  :character   Median :14.615  
##  Mean   :1475.5   Mean   :29.5                      Mean   :16.743  
##  3rd Qu.:2212.8   3rd Qu.:30.0                      3rd Qu.:20.192  
##  Max.   :2950.0   Max.   :30.0                      Max.   :97.500  
##    education    
##  Min.   : 6.00  
##  1st Qu.:12.00  
##  Median :13.00  
##  Mean   :13.55  
##  3rd Qu.:16.00  
##  Max.   :18.00
# Mean and median of the `age` and `earnings` columns, collected in a data
# frame with one row per attribute and the columns `mean` and `median`.
#
# The intermediate results are stored under names that do not mask
# base::mean() and base::median(), and vapply() checks that every column
# yields exactly one numeric value.
summary_cols <- c("age", "earnings")
col_means <- vapply(Data_education[summary_cols], mean, numeric(1))
col_medians <- vapply(Data_education[summary_cols], median, numeric(1))

# Row names (age, earnings) are taken from the names of the vapply() results.
mmFrame <- data.frame(mean = col_means, median = col_medians)
print(mmFrame)
##              mean   median
## age      29.49763 29.00000
## earnings 16.74272 14.61539

Question 2

Create a new data frame with a subset of the columns AND rows. There are several ways to do this, so feel free to try a couple if you want. Make sure to give the new data set a different name so that it doesn’t simply overwrite the original.

# Build a smaller data frame: the first 20 rows and four of the columns
# (the `X` column is left out).
kept_columns <- c("age", "gender", "earnings", "education")
newFrame <- Data_education[seq_len(20), kept_columns]

# Show only the rows whose education value equals 12
rows_with_12 <- which(newFrame[["education"]] == 12)
print(newFrame[rows_with_12, ])
##    age gender  earnings education
## 3   30 female 13.736263        12
## 6   30 female  8.000000        12
## 7   30   male 19.230770        12
## 10  30 female  6.221719        12
## 15  29   male  5.769231        12
## 16  29   male 13.500000        12

Question 3

Create new column names for each column in the new data frame created in step 2.

# Replace all four column names of the subset with "new_"-prefixed names.
# Names are assigned by position, so the order must match the column order.
newFrame <- setNames(
  newFrame,
  c("new_age", "new_gender", "new_earnings", "new_education")
)

print(newFrame)
##    new_age new_gender new_earnings new_education
## 1       30       male    34.615383            16
## 2       30     female    19.230770            16
## 3       30     female    13.736263            12
## 4       30     female    13.942307            13
## 5       30     female    19.230770            16
## 6       30     female     8.000000            12
## 7       30       male    19.230770            12
## 8       29       male    26.153847            16
## 9       29       male    26.442308            16
## 10      30     female     6.221719            12
## 11      29       male    24.038462            14
## 12      29       male    38.461540            18
## 13      29       male    19.230770            18
## 14      29     female    10.817307            11
## 15      29       male     5.769231            12
## 16      29       male    13.500000            12
## 17      30       male    12.980769            16
## 18      30       male    35.470085            13
## 19      30       male    18.750000            16
## 20      30     female    24.038462            16

Question 4

Use the summary function to create an overview of your new data frame created in step 2. Then print the mean and median for the same two attributes. Please compare (i.e. tell me how the values changed and why).

# Per-column overview of the subset data frame (columns carry the "new_" names)
summary(newFrame)
##     new_age      new_gender         new_earnings    new_education  
##  Min.   :29.0   Length:20          Min.   : 5.769   Min.   :11.00  
##  1st Qu.:29.0   Class :character   1st Qu.:13.370   1st Qu.:12.00  
##  Median :30.0   Mode  :character   Median :19.231   Median :15.00  
##  Mean   :29.6                      Mean   :19.493   Mean   :14.35  
##  3rd Qu.:30.0                      3rd Qu.:24.567   3rd Qu.:16.00  
##  Max.   :30.0                      Max.   :38.462   Max.   :18.00
# Mean and median of the same two attributes, computed on the subset.
# vapply() is used instead of sapply() so that each column is checked to
# produce exactly one numeric value.
subset_cols <- c("new_age", "new_earnings")
newmean <- vapply(newFrame[subset_cols], mean, numeric(1))
newmedian <- vapply(newFrame[subset_cols], median, numeric(1))

# The data frame columns are named after the variables (newmean, newmedian);
# the row names come from the names of the vapply() results.
newmmFrame <- data.frame(newmean, newmedian)
print(newmmFrame)
##               newmean newmedian
## new_age      29.60000  30.00000
## new_earnings 19.49304  19.23077
# Element-wise difference between the statistics of the full data set and
# those of the subset (full minus subset), with descriptive column names.
mmCompare <- setNames(
  mmFrame - newmmFrame,
  c("compare_mean", "compare_median")
)
mmCompare
##          compare_mean compare_median
## age        -0.1023729      -1.000000
## earnings   -2.7503209      -4.615385
# Written answer to the "how did the values change and why" part
print("Since the new data frame have less data, the mean and median result will be somewhat different")
## [1] "Since the new data frame have less data, the mean and median result will be somewhat different"

Question 5

For at least 3 distinct values in a column, rename the values so that every occurrence in that column is changed. For example, change the letter “e” to “excellent”, the letter “a” to “average” and the word “bad” to “terrible”.

#insert lib
library("stringr")

# Recode the education values of the subset into descriptive labels.

# Current values of the column as character strings, so they can be matched
# against text patterns.
newValue <- as.character(newFrame[["new_education"]])
unique(newValue)
## [1] "16" "12" "13" "14" "18" "11"

# One anchored pattern per education value (pattern = replacement).
# Anchoring with ^...$ makes each pattern match only a complete value, so the
# replacements cannot interfere with one another and their order is irrelevant.
# Patterns for values that do not occur in this subset (9, 10, 15, 17) match
# nothing and leave the data unchanged, so they no longer need to be disabled.
# NOTE(review): "Gradurate student" looks like a misspelling of
# "Graduate student"; the label is kept as-is so the output stays the same.
education_labels <- c(
  "^9$" = "Freshman HighSchool",
  "^10$" = "Junior HighSchool",
  "^11$" = "Senior HighSchool",
  "^12$" = "Freshman Undergrad",
  "^13$" = "Sophomore Undergrad",
  "^14$" = "Junior Undergrad",
  "^15$" = "Senior Undergrad",
  "^16$" = "Gradurate student",
  "^17$" = "PHD",
  "^18$" = "MD"
)
newValue <- str_replace_all(newValue, education_labels)
unique(newValue)
## [1] "Gradurate student"   "Freshman Undergrad"  "Sophomore Undergrad"
## [4] "Junior Undergrad"    "MD"                  "Senior HighSchool"
#apply into the data
newFrame[["new_education"]] <- newValue

newFrame
##    new_age new_gender new_earnings       new_education
## 1       30       male    34.615383   Gradurate student
## 2       30     female    19.230770   Gradurate student
## 3       30     female    13.736263  Freshman Undergrad
## 4       30     female    13.942307 Sophomore Undergrad
## 5       30     female    19.230770   Gradurate student
## 6       30     female     8.000000  Freshman Undergrad
## 7       30       male    19.230770  Freshman Undergrad
## 8       29       male    26.153847   Gradurate student
## 9       29       male    26.442308   Gradurate student
## 10      30     female     6.221719  Freshman Undergrad
## 11      29       male    24.038462    Junior Undergrad
## 12      29       male    38.461540                  MD
## 13      29       male    19.230770                  MD
## 14      29     female    10.817307   Senior HighSchool
## 15      29       male     5.769231  Freshman Undergrad
## 16      29       male    13.500000  Freshman Undergrad
## 17      30       male    12.980769   Gradurate student
## 18      30       male    35.470085 Sophomore Undergrad
## 19      30       male    18.750000   Gradurate student
## 20      30     female    24.038462   Gradurate student

Question 6

Display enough rows to see examples of all of steps 1-5 above. This means use a function to show me enough row values that I can see the changes.

Question 7

BONUS – place the original .csv in a github file and have R read from the link. This should be your own github – not the file source. This will be a very useful skill as you progress in your data science education and career.