# Install the tidyverse package if it is not already installed.
# requireNamespace() only reports whether the package is available; unlike
# require(), it does not attach the package as a side effect of the check.
# Attaching is left to the explicit library(tidyverse) call.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyverse)
# Read the .txt file from the desktop into `wages` with readr's read_csv().
# NOTE(review): the absolute path below is specific to one machine; the script
# will not run elsewhere unless the file exists at this exact location.
wages <- readr::read_csv(
  file = "C:/Users/djpri/Desktop/CUNY/Bridge_R/week2/wages_education.txt"
)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## nr = col_double(),
## year = col_double(),
## school = col_double(),
## exper = col_double(),
## union = col_character(),
## ethn = col_character(),
## maried = col_character(),
## health = col_character(),
## wage = col_double(),
## industry = col_character(),
## occupation = col_character(),
## residence = col_character()
## )
# Read the .csv file from its URL into `wages_web` with base R's read.csv().
wages_url <-
  "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/Ecdat/Males.csv"
wages_web <- utils::read.csv(file = wages_url)
head(wages)
## # A tibble: 6 x 13
## X1 nr year school exper union ethn maried health wage industry
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 1 13 1980 14 1 no other no no 1.20 Busines~
## 2 2 13 1981 14 2 yes other no no 1.85 Persona~
## 3 3 13 1982 14 3 no other no no 1.34 Busines~
## 4 4 13 1983 14 4 no other no no 1.43 Busines~
## 5 5 13 1984 14 5 no other no no 1.57 Persona~
## 6 6 13 1985 14 6 no other no no 1.70 Busines~
## # ... with 2 more variables: occupation <chr>, residence <chr>
tail(wages)
## # A tibble: 6 x 13
## X1 nr year school exper union ethn maried health wage industry
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 4355 12548 1982 9 7 no other no no 0.832 Constru~
## 2 4356 12548 1983 9 8 no other yes no 1.59 Constru~
## 3 4357 12548 1984 9 9 yes other yes no 1.21 Constru~
## 4 4358 12548 1985 9 10 no other yes no 1.77 Constru~
## 5 4359 12548 1986 9 11 yes other yes no 1.75 Profess~
## 6 4360 12548 1987 9 12 yes other yes no 1.47 Public_~
## # ... with 2 more variables: occupation <chr>, residence <chr>
head(wages_web)
## X nr year school exper union ethn maried health wage
## 1 1 13 1980 14 1 no other no no 1.197540
## 2 2 13 1981 14 2 yes other no no 1.853060
## 3 3 13 1982 14 3 no other no no 1.344462
## 4 4 13 1983 14 4 no other no no 1.433213
## 5 5 13 1984 14 5 no other no no 1.568125
## 6 6 13 1985 14 6 no other no no 1.699891
## industry occupation
## 1 Business_and_Repair_Service Service_Workers
## 2 Personal_Service Service_Workers
## 3 Business_and_Repair_Service Service_Workers
## 4 Business_and_Repair_Service Service_Workers
## 5 Personal_Service Craftsmen, Foremen_and_kindred
## 6 Business_and_Repair_Service Managers, Officials_and_Proprietors
## residence
## 1 north_east
## 2 north_east
## 3 north_east
## 4 north_east
## 5 north_east
## 6 north_east
tail(wages_web)
## X nr year school exper union ethn maried health wage
## 4355 4355 12548 1982 9 7 no other no no 0.8324815
## 4356 4356 12548 1983 9 8 no other yes no 1.5918787
## 4357 4357 12548 1984 9 9 yes other yes no 1.2125428
## 4358 4358 12548 1985 9 10 no other yes no 1.7659618
## 4359 4359 12548 1986 9 11 yes other yes no 1.7458942
## 4360 4360 12548 1987 9 12 yes other yes no 1.4665429
## industry occupation
## 4355 Construction Craftsmen, Foremen_and_kindred
## 4356 Construction Craftsmen, Foremen_and_kindred
## 4357 Construction Craftsmen, Foremen_and_kindred
## 4358 Construction Craftsmen, Foremen_and_kindred
## 4359 Professional_and_Related Service Craftsmen, Foremen_and_kindred
## 4360 Public_Administration Craftsmen, Foremen_and_kindred
## residence
## 4355 <NA>
## 4356 <NA>
## 4357 <NA>
## 4358 <NA>
## 4359 <NA>
## 4360 <NA>
Use the summary() function to gain an overview of the data set.
Display the mean() and median() for at least two attributes.
* The mean is a measure of central tendency, along with the median and the mode.
* The mean is obtained by dividing the sum of all values by the number of values in the data set.
* The median is the value of the middle term in a data set that has been ranked in increasing order.
* The mode is the value that occurs with the highest frequency in a data set. Note that R's mode() function returns an object's storage mode rather than this statistic.
summary() function to the wages data.summary(wages)
## X1 nr year school
## Min. : 1 Min. : 13 Min. :1980 Min. : 3.00
## 1st Qu.:1091 1st Qu.: 2329 1st Qu.:1982 1st Qu.:11.00
## Median :2180 Median : 4569 Median :1984 Median :12.00
## Mean :2180 Mean : 5262 Mean :1984 Mean :11.77
## 3rd Qu.:3270 3rd Qu.: 8406 3rd Qu.:1985 3rd Qu.:12.00
## Max. :4360 Max. :12548 Max. :1987 Max. :16.00
## exper union ethn maried
## Min. : 0.000 Length:4360 Length:4360 Length:4360
## 1st Qu.: 4.000 Class :character Class :character Class :character
## Median : 6.000 Mode :character Mode :character Mode :character
## Mean : 6.515
## 3rd Qu.: 9.000
## Max. :18.000
## health wage industry occupation
## Length:4360 Min. :-3.579 Length:4360 Length:4360
## Class :character 1st Qu.: 1.351 Class :character Class :character
## Mode :character Median : 1.671 Mode :character Mode :character
## Mean : 1.649
## 3rd Qu.: 1.991
## Max. : 4.052
## residence
## Length:4360
## Class :character
## Mode :character
##
##
##
# Display the mean() and median() for three attributes of `wages`.

# year: mean and median
mean(wages[["year"]])
## [1] 1983.5
median(wages[["year"]])
## [1] 1983.5

# school (years of schooling): mean and median
mean(wages[["school"]])
## [1] 11.76697
median(wages[["school"]])
## [1] 12

# wage (log of hourly wages): mean and median
mean(wages[["wage"]])
## [1] 1.649147
median(wages[["wage"]])
## [1] 1.671143
# Display the results in a tribble: one row per measure (mean, median) and one
# column per attribute (year, school, wage).
# tribble() is provided by the tibble package, which library(tidyverse)
# attaches.
wages_trib <- tribble(
  ~measure, ~year,                     ~school,                     ~wage,
  "mean",   mean(wages[["year"]]),     mean(wages[["school"]]),     mean(wages[["wage"]]),
  "median", median(wages[["year"]]),   median(wages[["school"]]),   median(wages[["wage"]])
)
print(wages_trib)
## # A tibble: 2 x 4
## measure year school wage
## <chr> <dbl> <dbl> <dbl>
## 1 mean 1984. 11.8 1.65
## 2 median 1984. 12 1.67
# Create a new data.frame() with a subset of the columns and rows, stored under
# a new name (wages_df).
# The generic subset() dispatches to subset.data.frame() for data frames, so
# this is the same call the method would receive directly.
# Rows kept: year 1983 or later, at least 12 years of schooling, union == "yes",
# and wage of at least 1.6. Columns kept: year, school, union, wage.
wages_df <- subset(
  wages,
  subset = year >= 1983 & school >= 12 & union == "yes" & wage >= 1.6,
  select = c("year", "school", "union", "wage")
)
data.frame().head(wages_df)
## # A tibble: 6 x 4
## year school union wage
## <dbl> <dbl> <chr> <dbl>
## 1 1983 12 yes 1.75
## 2 1983 12 yes 2.19
## 3 1983 12 yes 1.65
## 4 1985 12 yes 1.83
## 5 1986 12 yes 1.88
## 6 1987 12 yes 2.02
# Create new column names for the data.frame(): apply rename() to the
# wages_trib tribble, capitalising year, school, and wage.
# NOTE(review): this reassigns wages_df, so the filtered subset of `wages`
# stored under that name earlier in the script is replaced by the renamed
# two-row summary table.
wages_df <- wages_trib %>%
  rename(
    Year = year,
    School = school,
    Wage = wage
  )
head(wages_df)
## # A tibble: 2 x 4
## measure Year School Wage
## <chr> <dbl> <dbl> <dbl>
## 1 mean 1984. 11.8 1.65
## 2 median 1984. 12 1.67
tail(wages_df)
## # A tibble: 2 x 4
## measure Year School Wage
## <chr> <dbl> <dbl> <dbl>
## 1 mean 1984. 11.8 1.65
## 2 median 1984. 12 1.67
summary() function to create an overview of your new data.frame().summary(wages_df)
## measure Year School Wage
## Length:2 Min. :1984 Min. :11.77 Min. :1.649
## Class :character 1st Qu.:1984 1st Qu.:11.83 1st Qu.:1.655
## Mode :character Median :1984 Median :11.88 Median :1.660
## Mean :1984 Mean :11.88 Mean :1.660
## 3rd Qu.:1984 3rd Qu.:11.94 3rd Qu.:1.666
## Max. :1984 Max. :12.00 Max. :1.671
# Print the mean() and median() for the same two attributes (year and wage).
# NOTE(review): these statistics are computed on the original `wages` data,
# not on wages_df.

# Year: mean and median
with(wages, mean(year))
## [1] 1983.5
with(wages, median(year))
## [1] 1983.5

# Wage: mean and median
with(wages, mean(wage))
## [1] 1.649147
with(wages, median(wage))
## [1] 1.671143
# Collect the year and wage statistics in a tribble for ease of view:
# one row per measure, one column per attribute.
mm1 <- tribble(
  ~measure, ~Year,                     ~Wage,
  "mean",   mean(wages[["year"]]),     mean(wages[["wage"]]),
  "median", median(wages[["year"]]),   median(wages[["wage"]])
)

# Print the tribble.
print(mm1)
## # A tibble: 2 x 3
## measure Year Wage
## <chr> <dbl> <dbl>
## 1 mean 1984. 1.65
## 2 median 1984. 1.67
# Recode the union column of `wages`: "yes" responses become "Union_Member"
# and "no" responses become "Non_Union_Member".
#
# The replacement is vectorised rather than done row by row with a manual
# counter: each statement builds a logical mask over the whole column and
# assigns the new label to the matching rows in one step.
# %in% never returns NA, so a missing value in the column is simply left
# as-is instead of making an if() condition fail; any value other than
# "yes"/"no" is likewise left unchanged.
wages$union[wages$union %in% "yes"] <- "Union_Member"
wages$union[wages$union %in% "no"] <- "Non_Union_Member"
head() & tail() functions to view the firstdata.frame().head(wages)
## # A tibble: 6 x 13
## X1 nr year school exper union ethn maried health wage industry
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 1 13 1980 14 1 Non_~ other no no 1.20 Busines~
## 2 2 13 1981 14 2 Unio~ other no no 1.85 Persona~
## 3 3 13 1982 14 3 Non_~ other no no 1.34 Busines~
## 4 4 13 1983 14 4 Non_~ other no no 1.43 Busines~
## 5 5 13 1984 14 5 Non_~ other no no 1.57 Persona~
## 6 6 13 1985 14 6 Non_~ other no no 1.70 Busines~
## # ... with 2 more variables: occupation <chr>, residence <chr>
tail(wages)
## # A tibble: 6 x 13
## X1 nr year school exper union ethn maried health wage industry
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 4355 12548 1982 9 7 Non_~ other no no 0.832 Constru~
## 2 4356 12548 1983 9 8 Non_~ other yes no 1.59 Constru~
## 3 4357 12548 1984 9 9 Unio~ other yes no 1.21 Constru~
## 4 4358 12548 1985 9 10 Non_~ other yes no 1.77 Constru~
## 5 4359 12548 1986 9 11 Unio~ other yes no 1.75 Profess~
## 6 4360 12548 1987 9 12 Unio~ other yes no 1.47 Public_~
## # ... with 2 more variables: occupation <chr>, residence <chr>