library(plyr)
library(RCurl)
# 1 read in data & print summary
# Download the CSV as a single string. failonerror = TRUE makes the request
# stop with an error on an HTTP failure (e.g. 404) instead of handing the
# server's error page to read.csv(), which would fail later in a confusing way.
x <- getURL(
  "https://raw.githubusercontent.com/KevinJpotter/edx_capstone/master/Cowles.csv",
  failonerror = TRUE
)
# Parse the downloaded text; the column named "X" supplies the row names.
df <- read.csv(text = x, header = TRUE, row.names = "X")
# Summarise the first two columns only.
summary(df[, 1:2])
## neuroticism extraversion
## Min. : 0.00 Min. : 2.00
## 1st Qu.: 8.00 1st Qu.:10.00
## Median :11.00 Median :13.00
## Mean :11.47 Mean :12.37
## 3rd Qu.:15.00 3rd Qu.:15.00
## Max. :24.00 Max. :23.00
# 2 create new df with fewer rows & cols
# Keep only the rows whose volunteer value is "no" (which() drops NA matches)
# and only the three listed columns.
new_df <- df[
  which(df[["volunteer"]] == "no"),
  c("neuroticism", "extraversion", "sex")
]
# Report the dimensions before and after on a single line.
cat(
  "The old df size is", nrow(df), "by", ncol(df),
  " The new_df size is", nrow(new_df), "by", ncol(new_df)
)
## The old df size is 1421 by 4 The new_df size is 824 by 3
# 3 rename cols
# Shorten the column names: old name on the left, new name on the right.
new_df <- plyr::rename(
  new_df,
  replace = c(
    "neuroticism" = "neuro",
    "extraversion" = "extra",
    "sex" = "m/f"
  )
)
# Show the first six rows to confirm the new names.
head(new_df, n = 6L)
## neuro extra m/f
## 1 16 13 female
## 2 8 14 male
## 3 5 16 male
## 4 8 20 female
## 5 9 19 male
## 6 6 15 male
# 4 summary of new df
# Summarise the two renamed score columns (the first two columns of new_df).
summary(new_df[, c("neuro", "extra")])
## neuro extra
## Min. : 0.00 Min. : 2.00
## 1st Qu.: 8.00 1st Qu.: 9.00
## Median :11.00 Median :12.00
## Mean :11.42 Mean :11.96
## 3rd Qu.:15.00 3rd Qu.:15.00
## Max. :23.00 Max. :23.00
# 5 & 6 rename values and show results
# First six rows before any values are replaced, for comparison.
head(new_df, n = 6L)
## neuro extra m/f
## 1 16 13 female
## 2 8 14 male
## 3 5 16 male
## 4 8 20 female
## 5 9 19 male
## 6 6 15 male
# Replace the values 13, 14 and 16 in the extra column with 1300, 1400 and
# 1600 respectively; every other value is left unchanged.
new_df[["extra"]] <- plyr::mapvalues(
  x = new_df[["extra"]],
  from = c(13, 14, 16),
  to = c(13, 14, 16) * 100
)
# First ten rows after the replacement.
head(new_df, n = 10)
## neuro extra m/f
## 1 16 1300 female
## 2 8 1400 male
## 3 5 1600 male
## 4 8 20 female
## 5 9 19 male
## 6 6 15 male
## 7 8 10 female
## 8 12 11 male
## 9 15 1600 male
## 10 18 7 male
# 7 see step 1