Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.
I originally pulled the file from this
link in the csv.index
provided. I uploaded a copy of the file to GitHub and am reading it from
there, as part of the Bonus.
# Original location of the data set (the link listed in the provided csv.index)
source_url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/PSID1976.csv"
# Copy of the same file uploaded to our GitHub repo (Bonus)
github_url <- "https://raw.githubusercontent.com/andrewbowen19/rBridgeCUNY/main/data/labor-force-participation.csv"
# Pulling file from our github repo
df <- read.csv(github_url, sep = ",", header = TRUE)
# Pulling file from "source" (csv.index) instead; reuses source_url rather than
# repeating the literal URL
# df <- read.table(source_url, sep = ",", header = TRUE)
# Printing summary of our df; call the summary() generic and let S3 dispatch
# select the data.frame method instead of invoking summary.data.frame() directly
print(summary(df))
## X participation hours youngkids
## Min. : 1 Length:753 Min. : 0.0 Min. :0.0000
## 1st Qu.:189 Class :character 1st Qu.: 0.0 1st Qu.:0.0000
## Median :377 Mode :character Median : 288.0 Median :0.0000
## Mean :377 Mean : 740.6 Mean :0.2377
## 3rd Qu.:565 3rd Qu.:1516.0 3rd Qu.:0.0000
## Max. :753 Max. :4950.0 Max. :3.0000
## oldkids age education wage
## Min. :0.000 Min. :30.00 Min. : 5.00 Min. : 0.000
## 1st Qu.:0.000 1st Qu.:36.00 1st Qu.:12.00 1st Qu.: 0.000
## Median :1.000 Median :43.00 Median :12.00 Median : 1.625
## Mean :1.353 Mean :42.54 Mean :12.29 Mean : 2.375
## 3rd Qu.:2.000 3rd Qu.:49.00 3rd Qu.:13.00 3rd Qu.: 3.788
## Max. :8.000 Max. :60.00 Max. :17.00 Max. :25.000
## repwage hhours hage heducation
## Min. :0.00 Min. : 175 Min. :30.00 Min. : 3.00
## 1st Qu.:0.00 1st Qu.:1928 1st Qu.:38.00 1st Qu.:11.00
## Median :0.00 Median :2164 Median :46.00 Median :12.00
## Mean :1.85 Mean :2267 Mean :45.12 Mean :12.49
## 3rd Qu.:3.58 3rd Qu.:2553 3rd Qu.:52.00 3rd Qu.:15.00
## Max. :9.98 Max. :5010 Max. :60.00 Max. :17.00
## hwage fincome tax meducation
## Min. : 0.4121 Min. : 1500 Min. :0.4415 Min. : 0.000
## 1st Qu.: 4.7883 1st Qu.:15428 1st Qu.:0.6215 1st Qu.: 7.000
## Median : 6.9758 Median :20880 Median :0.6915 Median :10.000
## Mean : 7.4822 Mean :23081 Mean :0.6789 Mean : 9.251
## 3rd Qu.: 9.1667 3rd Qu.:28200 3rd Qu.:0.7215 3rd Qu.:12.000
## Max. :40.5090 Max. :96000 Max. :0.9415 Max. :17.000
## feducation unemp city experience
## Min. : 0.000 Min. : 3.000 Length:753 Min. : 0.00
## 1st Qu.: 7.000 1st Qu.: 7.500 Class :character 1st Qu.: 4.00
## Median : 7.000 Median : 7.500 Mode :character Median : 9.00
## Mean : 8.809 Mean : 8.624 Mean :10.63
## 3rd Qu.:12.000 3rd Qu.:11.000 3rd Qu.:15.00
## Max. :17.000 Max. :14.000 Max. :45.00
## college hcollege
## Length:753 Length:753
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Preview the first five rows of the full data set
head(df, n = 5)
## X participation hours youngkids oldkids age education wage repwage hhours
## 1 1 yes 1610 1 0 32 12 3.3540 2.65 2708
## 2 2 yes 1656 0 2 30 12 1.3889 2.65 2310
## 3 3 yes 1980 1 3 35 12 4.5455 4.04 3072
## 4 4 yes 456 0 3 34 12 1.0965 3.25 1920
## 5 5 yes 1568 1 2 31 14 4.5918 3.60 2000
## hage heducation hwage fincome tax meducation feducation unemp city
## 1 34 12 4.0288 16310 0.7215 12 7 5.0 no
## 2 30 9 8.4416 21800 0.6615 7 7 11.0 yes
## 3 40 12 3.5807 21040 0.6915 12 7 5.0 no
## 4 53 10 3.5417 7300 0.7815 7 7 5.0 no
## 5 32 12 10.0000 27300 0.6215 12 14 9.5 yes
## experience college hcollege
## 1 14 no no
## 2 5 no no
## 3 15 no no
## 4 6 no no
## 5 7 yes no
# Mean and median of two data.frame attributes: wage and hours
print(mean(df$wage))
## [1] 2.374565
print(median(df$wage))
## [1] 1.625
print(mean(df$hours))
## [1] 740.5764
print(median(df$hours))
## [1] 288
Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# Keep only the first 100 rows, then narrow to the six columns of interest
newDF <- df[seq_len(100), ]
newDF <- newDF[c("participation", "age", "wage", "hours", "college", "city")]
# Confirm the row count of the subset
nrow(newDF)
## [1] 100
Create new column names for the new dataframe
# Rename the six columns (positions 1 through 6) in a single assignment
colnames(newDF)[seq_len(6)] <- c(
  "participate",
  "worker_age",
  "worker_wage",
  "hours_worked",
  "attended_college",
  "city_of_residence"
)
# Show the full renamed subset
print(newDF)
## participate worker_age worker_wage hours_worked attended_college
## 1 yes 32 3.3540 1610 no
## 2 yes 30 1.3889 1656 no
## 3 yes 35 4.5455 1980 no
## 4 yes 34 1.0965 456 no
## 5 yes 31 4.5918 1568 yes
## 6 yes 54 4.7421 2032 no
## 7 yes 37 8.3333 1440 yes
## 8 yes 54 7.8431 1020 no
## 9 yes 48 2.1262 1458 no
## 10 yes 39 4.6875 1600 no
## 11 yes 33 4.0630 1969 no
## 12 yes 42 4.5918 1960 no
## 13 yes 30 2.0833 240 no
## 14 yes 43 2.2668 997 no
## 15 yes 43 3.6797 1848 no
## 16 yes 35 1.3472 1224 no
## 17 yes 43 3.2143 1400 no
## 18 yes 39 5.1750 640 no
## 19 yes 45 2.0000 2000 no
## 20 yes 35 7.5529 1324 no
## 21 yes 42 3.5052 2215 yes
## 22 yes 30 3.5714 1680 no
## 23 yes 48 3.2500 1600 yes
## 24 yes 45 3.2500 800 no
## 25 yes 31 2.1545 1955 no
## 26 yes 43 3.7879 660 yes
## 27 yes 59 4.0000 525 no
## 28 yes 32 4.7269 1904 no
## 29 yes 31 7.2559 1516 yes
## 30 yes 42 5.8671 346 no
## 31 yes 50 1.5385 1040 no
## 32 yes 59 2.4590 732 yes
## 33 yes 36 5.8511 1880 yes
## 34 yes 51 3.5714 1680 no
## 35 yes 45 3.8068 2081 yes
## 36 yes 42 2.4638 690 no
## 37 yes 46 2.3753 4210 no
## 38 yes 46 4.5351 2205 no
## 39 yes 51 5.6183 1952 yes
## 40 yes 30 14.6310 1302 yes
## 41 yes 30 2.6786 112 no
## 42 yes 57 3.9194 893 no
## 43 yes 31 2.5729 583 yes
## 44 yes 48 4.5375 480 no
## 45 yes 30 2.0000 1900 no
## 46 yes 34 3.4722 576 no
## 47 yes 48 2.0161 2056 yes
## 48 yes 45 4.5716 1984 no
## 49 yes 51 2.2727 2640 no
## 50 yes 30 2.6375 240 no
## 51 yes 46 2.2899 1173 no
## 52 yes 58 1.0989 3640 no
## 53 yes 37 1.1765 340 no
## 54 yes 52 1.6000 500 no
## 55 yes 52 1.8762 1599 no
## 56 yes 31 4.0437 1830 yes
## 57 yes 55 9.6354 1920 yes
## 58 yes 34 8.0409 2052 yes
## 59 yes 55 4.5990 2312 yes
## 60 yes 39 2.1429 196 no
## 61 yes 40 4.4000 2500 yes
## 62 yes 43 3.5354 1980 no
## 63 yes 48 2.7174 1840 no
## 64 yes 47 6.2500 320 no
## 65 yes 41 11.9330 419 no
## 66 yes 36 3.5931 1880 no
## 67 yes 46 6.9444 72 yes
## 68 yes 34 2.9167 120 no
## 69 yes 41 3.0769 1885 no
## 70 yes 51 3.7500 240 no
## 71 yes 33 5.7259 1729 no
## 72 yes 52 3.6757 1850 no
## 73 yes 58 5.1648 2033 no
## 74 yes 34 8.2237 608 no
## 75 yes 31 4.3365 1153 no
## 76 yes 48 4.9819 2208 no
## 77 yes 32 0.3571 252 no
## 78 yes 49 2.9674 337 yes
## 79 yes 32 1.0000 90 yes
## 80 yes 58 2.5554 1174 no
## 81 yes 50 0.8602 372 no
## 82 yes 60 1.0000 30 yes
## 83 yes 50 2.9261 1800 no
## 84 yes 56 3.5461 282 yes
## 85 yes 51 1.6264 720 no
## 86 yes 54 8.3333 1440 yes
## 87 yes 59 3.0952 2100 yes
## 88 yes 46 2.7000 1000 no
## 89 yes 46 5.2521 952 yes
## 90 yes 39 1.4154 1413 no
## 91 yes 44 4.7986 2100 no
## 92 yes 33 1.6667 120 no
## 93 yes 33 1.1217 3000 no
## 94 yes 48 0.5000 1000 no
## 95 yes 31 0.7143 336 no
## 96 yes 45 2.7961 1216 no
## 97 yes 45 4.8583 988 no
## 98 yes 32 1.7435 2581 yes
## 99 yes 47 2.4631 2030 no
## 100 yes 34 2.4213 413 yes
## city_of_residence
## 1 no
## 2 yes
## 3 no
## 4 no
## 5 yes
## 6 yes
## 7 no
## 8 no
## 9 no
## 10 no
## 11 no
## 12 no
## 13 no
## 14 yes
## 15 yes
## 16 yes
## 17 yes
## 18 yes
## 19 yes
## 20 no
## 21 no
## 22 no
## 23 yes
## 24 no
## 25 yes
## 26 yes
## 27 yes
## 28 yes
## 29 no
## 30 no
## 31 yes
## 32 yes
## 33 no
## 34 yes
## 35 yes
## 36 no
## 37 yes
## 38 no
## 39 no
## 40 yes
## 41 no
## 42 yes
## 43 yes
## 44 no
## 45 no
## 46 no
## 47 yes
## 48 no
## 49 no
## 50 no
## 51 yes
## 52 yes
## 53 no
## 54 yes
## 55 no
## 56 yes
## 57 yes
## 58 yes
## 59 yes
## 60 yes
## 61 no
## 62 yes
## 63 yes
## 64 yes
## 65 yes
## 66 no
## 67 no
## 68 yes
## 69 yes
## 70 no
## 71 yes
## 72 yes
## 73 yes
## 74 yes
## 75 yes
## 76 yes
## 77 no
## 78 yes
## 79 yes
## 80 yes
## 81 yes
## 82 no
## 83 yes
## 84 yes
## 85 no
## 86 yes
## 87 yes
## 88 yes
## 89 yes
## 90 no
## 91 yes
## 92 yes
## 93 yes
## 94 yes
## 95 no
## 96 no
## 97 yes
## 98 yes
## 99 no
## 100 no
Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.
# Overview of the subset; use the summary() generic so S3 dispatch selects the
# data.frame method rather than calling summary.data.frame() directly
summary(newDF)
## participate worker_age worker_wage hours_worked
## Length:100 Min. :30.00 Min. : 0.3571 Min. : 30.0
## Class :character 1st Qu.:34.00 1st Qu.: 2.1516 1st Qu.: 581.2
## Mode :character Median :43.00 Median : 3.4131 Median :1426.5
## Mean :42.61 Mean : 3.7603 Mean :1330.0
## 3rd Qu.:49.25 3rd Qu.: 4.6211 3rd Qu.:1952.8
## Max. :60.00 Max. :14.6310 Max. :4210.0
## attended_college city_of_residence
## Length:100 Length:100
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Mean and median of wage in the 100-row subset
print(mean(newDF[["worker_wage"]]))
## [1] 3.760277
print(median(newDF[["worker_wage"]]))
## [1] 3.4131
# Mean and median of hours worked in the 100-row subset
print(mean(newDF[["hours_worked"]]))
## [1] 1330.04
print(median(newDF[["hours_worked"]]))
## [1] 1426.5
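In our subset of the first 100 rows, both the mean and the median of the same two attributes (wage and hours) are higher than in the full data set: the mean wage rose from 2.37 to 3.76 and the median wage from 1.625 to 3.41, while the mean hours rose from 740.6 to 1330.0 and the median hours from 288 to 1426.5. This is expected, because every one of the first 100 rows is a labor-force participant, whereas the full data set contains many rows with zero hours and zero wage (the first quartile of both is 0).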
For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.
We want to map our boolean columns (“participate”, “attended_college”) to TRUE/FALSE based on their yes/no responses.
# Map yes -> TRUE and no -> FALSE. Comparing against "yes" already yields the
# logical vector we want, so "yes" becomes TRUE and anything else FALSE
# (NA stays NA).
newDF$participate <- newDF$participate == "yes"
newDF$attended_college <- newDF$attended_college == "yes"
# Renaming city column to LA or NYC (boolean-ish value)
newDF$city_of_residence <- ifelse(
  newDF$city_of_residence == "yes", "LA", "NYC"
)
head(newDF, 10)
## participate worker_age worker_wage hours_worked attended_college
## 1 TRUE 32 3.3540 1610 FALSE
## 2 TRUE 30 1.3889 1656 FALSE
## 3 TRUE 35 4.5455 1980 FALSE
## 4 TRUE 34 1.0965 456 FALSE
## 5 TRUE 31 4.5918 1568 TRUE
## 6 TRUE 54 4.7421 2032 FALSE
## 7 TRUE 37 8.3333 1440 TRUE
## 8 TRUE 54 7.8431 1020 FALSE
## 9 TRUE 48 2.1262 1458 FALSE
## 10 TRUE 39 4.6875 1600 FALSE
## city_of_residence
## 1 NYC
## 2 LA
## 3 NYC
## 4 NYC
## 5 LA
## 6 LA
## 7 NYC
## 8 NYC
## 9 NYC
## 10 NYC
Display enough rows to see examples of all of steps 1-5 above.
head(newDF, 20)
## participate worker_age worker_wage hours_worked attended_college
## 1 TRUE 32 3.3540 1610 FALSE
## 2 TRUE 30 1.3889 1656 FALSE
## 3 TRUE 35 4.5455 1980 FALSE
## 4 TRUE 34 1.0965 456 FALSE
## 5 TRUE 31 4.5918 1568 TRUE
## 6 TRUE 54 4.7421 2032 FALSE
## 7 TRUE 37 8.3333 1440 TRUE
## 8 TRUE 54 7.8431 1020 FALSE
## 9 TRUE 48 2.1262 1458 FALSE
## 10 TRUE 39 4.6875 1600 FALSE
## 11 TRUE 33 4.0630 1969 FALSE
## 12 TRUE 42 4.5918 1960 FALSE
## 13 TRUE 30 2.0833 240 FALSE
## 14 TRUE 43 2.2668 997 FALSE
## 15 TRUE 43 3.6797 1848 FALSE
## 16 TRUE 35 1.3472 1224 FALSE
## 17 TRUE 43 3.2143 1400 FALSE
## 18 TRUE 39 5.1750 640 FALSE
## 19 TRUE 45 2.0000 2000 FALSE
## 20 TRUE 35 7.5529 1324 FALSE
## city_of_residence
## 1 NYC
## 2 LA
## 3 NYC
## 4 NYC
## 5 LA
## 6 LA
## 7 NYC
## 8 NYC
## 9 NYC
## 10 NYC
## 11 NYC
## 12 NYC
## 13 NYC
## 14 LA
## 15 LA
## 16 LA
## 17 LA
## 18 LA
## 19 LA
## 20 NYC