One thing I really appreciate about this approach is that, before writing this code, I had to repeatedly change the file path depending on where I was working (at home I have a Mac desktop, a Mac laptop, and a PC laptop; at work I have a PC laptop).
My understanding is that relative paths of the type used in Python aren’t relevant in R because everything is supposed to reference back to a working directory. I’m still trying to work out how to do that without uploading everything to GitHub, which seems unwieldy.
# Load the free1 survey data directly from the Rdatasets GitHub repository so
# the same code runs on any machine without editing a local file path.
# stringsAsFactors = TRUE keeps `country` a factor on R >= 4.0, where the
# read.csv() default changed to FALSE; the summary() counts and the levels()
# call further down rely on it being a factor.
mydata <- read.csv(
  "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/Zelig/free1.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Working copy; `mydata` stays available as the untouched original.
freedata <- data.frame(mydata)
# Show the first six rows as a quick sanity check of the import.
head(freedata)
## X sex age educ country y v1 v2 v3 v4 v5 v6
## 1 109276 0 20 4 Eurasia 1 4 3 3 5 3 4
## 2 88178 1 25 4 Oceana 2 3 3 5 5 5 5
## 3 111063 1 56 2 Eastasia 2 3 2 4 5 5 4
## 4 161488 0 65 6 Eastasia 2 3 3 5 5 5 5
## 5 44532 1 50 5 Oceana 1 5 3 5 5 3 5
## 6 95503 0 20 5 Eastasia 1 4 4 3 3 4 4
Please note the following coding definitions:
sex: 0 = female / 1 = male
educ: 1 = No formal education / 2 = Less than primary school education / 3 = Completed primary school / 4 = Completed secondary school / 5 = Completed high school / 6 = Completed college / 7 = Completed post-graduate degree
# Per-column summary statistics (including NA counts) for the full data set.
summary(freedata)
## X sex age educ
## Min. : 142 Min. :0.0000 Min. : 1.00 Min. :1.000
## 1st Qu.: 52723 1st Qu.:0.0000 1st Qu.:27.00 1st Qu.:1.000
## Median :108699 Median :1.0000 Median :39.00 Median :3.000
## Mean : 90665 Mean :0.5568 Mean :40.74 Mean :2.942
## 3rd Qu.:119325 3rd Qu.:1.0000 3rd Qu.:52.00 3rd Qu.:4.000
## Max. :171811 Max. :1.0000 Max. :90.00 Max. :7.000
## NA's :1 NA's :4 NA's :5
## country y v1 v2
## Eastasia:150 Min. :1.00 Min. :1.000 Min. :1.000
## Eurasia :150 1st Qu.:3.00 1st Qu.:2.000 1st Qu.:2.000
## Oceana :150 Median :4.00 Median :3.000 Median :2.000
## Mean :3.52 Mean :2.649 Mean :2.536
## 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.00 Max. :5.000 Max. :5.000
##
## v3 v4 v5 v6
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:4.00
## Median :4.000 Median :4.000 Median :4.000 Median :5.00
## Mean :3.664 Mean :4.084 Mean :3.867 Mean :4.38
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
Don’t forget to check these medians against the summary output above.
# Median of each demographic column, ignoring missing values.
free_sex_med <- median(freedata[["sex"]], na.rm = TRUE)
free_age_med <- median(freedata[["age"]], na.rm = TRUE)
free_ed_med <- median(freedata[["educ"]], na.rm = TRUE)

# Collect the three medians (sex, age, educ order) in one unnamed vector
# and display it.
free_med <- c(free_sex_med, free_age_med, free_ed_med)
print(free_med)
## [1] 1 39 3
Don’t forget to check these means against the summary output above.
# Mean of each demographic column, ignoring missing values.
# Columns are selected by name rather than by position so the result does not
# silently change if the CSV's column order does, and vapply() enforces exactly
# one numeric value per column (sapply() would return whatever shape it got).
free_mean <- vapply(
  freedata[, c("sex", "age", "educ")],
  mean,
  numeric(1),
  na.rm = TRUE
)
free_mean
## sex age educ
## 0.5567929 40.7443946 2.9415730
# Keep only the rows whose y value is below 5 (rows where y is missing are
# dropped too) and retain columns 2 through 5: sex, age, educ, country.
freecut <- mydata[which(mydata$y < 5), 2:5, drop = FALSE]
# Preview the first six rows of the filtered data.
head(freecut)
## sex age educ country
## 1 0 20 4 Eurasia
## 2 1 25 4 Oceana
## 3 1 56 2 Eastasia
## 4 0 65 6 Eastasia
## 5 1 50 5 Oceana
## 6 0 20 5 Eastasia
# Give the four retained columns new names (same column order as before),
# then summarise the renamed data frame.
freecut <- setNames(
  freecut,
  c("gender", "oldness", "student_loaniness", "homeland")
)
summary(freecut)
## gender oldness student_loaniness homeland
## Min. :0.0000 Min. : 1.00 Min. :1.000 Eastasia:106
## 1st Qu.:0.0000 1st Qu.:27.00 1st Qu.:2.000 Eurasia : 80
## Median :1.0000 Median :38.50 Median :3.000 Oceana :130
## Mean :0.5429 Mean :41.25 Mean :3.077
## 3rd Qu.:1.0000 3rd Qu.:53.00 3rd Qu.:4.000
## Max. :1.0000 Max. :84.00 Max. :7.000
## NA's :1 NA's :2 NA's :3
# Column means of the three numeric columns in the filtered data,
# ignoring missing values.
fcut_mean <- sapply(freecut[1:3], mean, na.rm = TRUE)
# Means of the full data set, shown for comparison.
print(free_mean)
## sex age educ
## 0.5567929 40.7443946 2.9415730
# Means of the filtered data set.
print(fcut_mean)
## gender oldness student_loaniness
## 0.5428571 41.2452229 3.0766773
# Difference: filtered minus full.
print(fcut_mean - free_mean)
## gender oldness student_loaniness
## -0.01393573 0.50082831 0.13510428
In the new data frame, the mean for gender is slightly more female than in the original, the mean age is about half a year older, and the mean education level is slightly higher.
# Column medians of the three numeric columns in the filtered data,
# ignoring missing values.
fcut_med <- sapply(freecut[1:3], median, na.rm = TRUE)
print(fcut_med)
## gender oldness student_loaniness
## 1.0 38.5 3.0
# Medians of the full data set, shown for comparison.
print(free_med)
## [1] 1 39 3
# Medians of the filtered data set again, next to the originals.
print(fcut_med)
## gender oldness student_loaniness
## 1.0 38.5 3.0
# Difference: filtered minus full.
print(fcut_med - free_med)
## gender oldness student_loaniness
## 0.0 -0.5 0.0
As median tends to be a more stable metric, it’s not surprising that the medians for sex and education had no change. Age shifted slightly younger.
Take a look at what levels are in the homeland factor
# List the distinct levels of the homeland factor before recoding them.
levels(freecut$homeland)
## [1] "Eastasia" "Eurasia" "Oceana"
# Recode the country names by value.
# The earlier version assigned the three-element vector
# c("East_Asia", "Asiope", "Nemoville") over every row; R recycles such a
# vector in row order (row 1 East_Asia, row 2 Asiope, row 3 Nemoville, ...),
# so each row received a label based on its position, not its country.
# Looking each value up in a named vector maps every row correctly.
homeland_map <- c(
  Eastasia = "East_Asia",
  Eurasia = "Asiope",
  Oceana = "Nemoville"
)
freecut$homeland <- as.factor(
  unname(homeland_map[as.character(freecut$homeland)])
)
# Alternative using plyr (the result must be assigned back to take effect):
# freecut$homeland <- plyr::revalue(freecut$homeland, homeland_map)
levels(freecut$homeland)
## [1] "Asiope" "East_Asia" "Nemoville"
head(freecut)
## gender oldness student_loaniness homeland
## 1 0 20 4 Asiope
## 2 1 25 4 Nemoville
## 3 1 56 2 East_Asia
## 4 0 65 6 East_Asia
## 5 1 50 5 Nemoville
## 6 0 20 5 East_Asia
# Save the recoded data frame to a local CSV and upload it to Dropbox.
library(rdrop2)
# Authenticate with Dropbox and keep the returned token.
token<-drop_auth()
# NOTE(review): nothing visible in this section calls a dropR function, and
# require() returns FALSE instead of erroring when a package is missing —
# confirm whether this load is needed at all.
require(dropR)
# Write the data frame to 'freecut.csv' in the current working directory.
write.csv(freecut, 'freecut.csv')
# Upload that file to the given Dropbox folder.
drop_upload('freecut.csv', path = "cuny_msds/bridge/R")