One of the challenges in working with data is wrangling. In this assignment we will use R to perform this task.
For this exercise, I am using the California Test Score Data from http://vincentarelbundock.github.io/Rdatasets/, which I have uploaded to the MSDS-R-Bridge repository on my GitHub
url <- "https://raw.githubusercontent.com/josh1den/MSDS-R-Bridge/main/CASchools.csv"
newfile <- read.csv(url)
head(newfile)
## X district school county grades students teachers
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195 10.90
## 2 2 61499 Manzanita Elementary Butte KK-08 240 11.15
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550 82.90
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243 14.00
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335 71.50
## 6 6 62042 Burrel Union Elementary Fresno KK-08 137 6.40
## calworks lunch computer expenditure income english read math
## 1 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6 690.0
## 2 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5 661.9
## 3 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3 650.9
## 4 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9 643.5
## 5 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8 639.9
## 6 12.3188 86.9565 25 5580.147 10.415000 12.408759 605.7 605.4
Summary:
# use the summary function to gain an overview of the data set
summary(newfile)
## X district school county
## Min. : 1.0 Min. :61382 Length:420 Length:420
## 1st Qu.:105.8 1st Qu.:64308 Class :character Class :character
## Median :210.5 Median :67760 Mode :character Mode :character
## Mean :210.5 Mean :67473
## 3rd Qu.:315.2 3rd Qu.:70419
## Max. :420.0 Max. :75440
## grades students teachers calworks
## Length:420 Min. : 81.0 Min. : 4.85 Min. : 0.000
## Class :character 1st Qu.: 379.0 1st Qu.: 19.66 1st Qu.: 4.395
## Mode :character Median : 950.5 Median : 48.56 Median :10.520
## Mean : 2628.8 Mean : 129.07 Mean :13.246
## 3rd Qu.: 3008.0 3rd Qu.: 146.35 3rd Qu.:18.981
## Max. :27176.0 Max. :1429.00 Max. :78.994
## lunch computer expenditure income
## Min. : 0.00 Min. : 0.0 Min. :3926 Min. : 5.335
## 1st Qu.: 23.28 1st Qu.: 46.0 1st Qu.:4906 1st Qu.:10.639
## Median : 41.75 Median : 117.5 Median :5215 Median :13.728
## Mean : 44.71 Mean : 303.4 Mean :5312 Mean :15.317
## 3rd Qu.: 66.86 3rd Qu.: 375.2 3rd Qu.:5601 3rd Qu.:17.629
## Max. :100.00 Max. :3324.0 Max. :7712 Max. :55.328
## english read math
## Min. : 0.000 Min. :604.5 Min. :605.4
## 1st Qu.: 1.941 1st Qu.:640.4 1st Qu.:639.4
## Median : 8.778 Median :655.8 Median :652.5
## Mean :15.768 Mean :655.0 Mean :653.3
## 3rd Qu.:22.970 3rd Qu.:668.7 3rd Qu.:665.9
## Max. :85.540 Max. :704.0 Max. :709.5
Mean and Median of Reading and Math Columns:
# display the mean and the median for at least two attributes
meanmed <- function(column) {
#returns the mean and median of a single column in a dataframe
avg <- round(mean(column), 2)
med <- round(median(column), 2)
print(paste("mean:", avg))
print(paste("median", med))
}
print("Reading Score:")
## [1] "Reading Score:"
meanmed(newfile$read)
## [1] "mean: 654.97"
## [1] "median 655.75"
print("Math Score:")
## [1] "Math Score:"
meanmed(newfile$math)
## [1] "mean: 653.34"
## [1] "median 652.45"
# select desired columns
cols <- c("school", "county", "students", "read", "math")
newdf <- newfile[cols]
# subset the new dataframe for schools above the median student size
med_student <- median(newfile$students)
newdf <- subset(newdf, students > med_student)
head(newdf)
## school county students read math
## 3 Thermalito Union Elementary Butte 1550 636.3 650.9
## 5 Palermo Union Elementary Butte 1335 641.8 639.9
## 10 Del Paso Heights Elementary Sacramento 2247 611.9 613.4
## 12 West Fresno Elementary Fresno 987 616.6 616.0
## 19 Lennox Elementary Los Angeles 6880 619.1 620.5
## 20 Lamont Elementary Kern 2688 621.3 619.3
# rename the columns
colnames(newdf) <- c("School", "County", "Students", "Reading", "Math")
colnames(newdf)
## [1] "School" "County" "Students" "Reading" "Math"
Summary:
# compare summary of the new dataframe
summary(newdf)
## School County Students Reading
## Length:210 Length:210 Min. : 953 Min. :611.9
## Class :character Class :character 1st Qu.: 1899 1st Qu.:637.1
## Mode :character Mode :character Median : 3011 Median :653.6
## Mean : 4850 Mean :653.0
## 3rd Qu.: 6268 3rd Qu.:666.6
## Max. :27176 Max. :704.0
## Math
## Min. :613.4
## 1st Qu.:637.9
## Median :652.0
## Mean :653.0
## 3rd Qu.:665.1
## Max. :709.5
Mean and Median of Reading and Math Columns:
# display the mean and median scores
print("Reading Score:")
## [1] "Reading Score:"
meanmed(newdf$Reading)
## [1] "mean: 653.02"
## [1] "median 653.6"
print("Math Score:")
## [1] "Math Score:"
meanmed(newdf$Math)
## [1] "mean: 653.01"
## [1] "median 651.95"
We can observe that both the median and mean reading and math scores are lower for schools above the median school size than for all schools.
Remove “Elementary” from all school names
# remove Elementary from every school name
newdf$School <- gsub(" Elementary", "",as.character(newdf$School))
Add “County” to “Orange”, “Kings”, and “Humboldt” counties
# add "County" to Orange, Kings, and Humboldt
newdf$County <- gsub("Orange", "Orange County", as.character(newdf$County))
newdf$County <- gsub("Kings", "Kings County", as.character(newdf$County))
newdf$County <- gsub("Humboldt", "Humboldt County", as.character(newdf$County))
Display first 20 rows of original dataframe
head(newfile, 20)
## X district school county grades students
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195
## 2 2 61499 Manzanita Elementary Butte KK-08 240
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335
## 6 6 62042 Burrel Union Elementary Fresno KK-08 137
## 7 7 68536 Holt Union Elementary San Joaquin KK-08 195
## 8 8 63834 Vineland Elementary Kern KK-08 888
## 9 9 62331 Orange Center Elementary Fresno KK-08 379
## 10 10 67306 Del Paso Heights Elementary Sacramento KK-06 2247
## 11 11 65722 Le Grand Union Elementary Merced KK-08 446
## 12 12 62174 West Fresno Elementary Fresno KK-08 987
## 13 13 71795 Allensworth Elementary Tulare KK-08 103
## 14 14 72181 Sunnyside Union Elementary Tulare KK-08 487
## 15 15 72298 Woodville Elementary Tulare KK-08 649
## 16 16 72041 Pixley Union Elementary Tulare KK-08 852
## 17 17 63594 Lost Hills Union Elementary Kern KK-08 491
## 18 18 63370 Buttonwillow Union Elementary Kern KK-08 421
## 19 19 64709 Lennox Elementary Los Angeles KK-08 6880
## 20 20 63560 Lamont Elementary Kern KK-08 2688
## teachers calworks lunch computer expenditure income english read
## 1 10.90 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6
## 2 11.15 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5
## 3 82.90 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3
## 4 14.00 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9
## 5 71.50 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8
## 6 6.40 12.3188 86.9565 25 5580.147 10.415000 12.408759 605.7
## 7 10.00 12.9032 94.6237 28 5253.331 6.577000 68.717949 604.5
## 8 42.50 18.8063 100.0000 66 4565.746 8.174000 46.959461 605.5
## 9 19.00 32.1900 93.1398 35 5355.548 7.385000 30.079157 608.9
## 10 108.00 78.9942 87.3164 0 5036.211 11.613333 40.275921 611.9
## 11 21.00 18.6099 85.8744 86 4547.692 8.931000 52.914799 612.8
## 12 47.00 71.7131 98.6056 56 5447.345 7.385000 54.609932 616.6
## 13 5.00 22.4299 98.1308 25 6567.149 5.335000 42.718445 612.8
## 14 24.34 24.6094 77.1484 0 4818.613 8.279000 20.533880 610.0
## 15 36.00 14.6379 76.2712 31 5621.456 9.630000 80.123260 611.9
## 16 42.07 24.2142 94.2957 80 6026.360 7.454000 49.413143 614.8
## 17 28.92 11.2016 97.7597 100 6723.238 6.216000 85.539719 611.7
## 18 25.50 8.5511 77.9097 50 5589.885 7.764000 58.907364 614.9
## 19 303.03 21.2824 94.9712 960 5064.616 7.022000 77.005814 619.1
## 20 135.00 23.4375 93.2292 139 5433.593 5.699000 49.813988 621.3
## math
## 1 690.0
## 2 661.9
## 3 650.9
## 4 643.5
## 5 639.9
## 6 605.4
## 7 609.0
## 8 612.5
## 9 616.1
## 10 613.4
## 11 618.7
## 12 616.0
## 13 619.8
## 14 622.6
## 15 621.0
## 16 619.9
## 17 624.4
## 18 621.7
## 19 620.5
## 20 619.3
Display only rows with “Orange”, “Kings”, and “Humboldt” counties:
cols <- c("Orange", "Kings", "Humboldt")
head(newfile[newfile$county %in% cols,], 40)
## X district school county grades students
## 63 63 66423 Anaheim Elementary Orange KK-06 20927
## 64 64 63974 Lemoore Union Elementary Kings KK-08 3017
## 65 65 63875 Armona Union Elementary Kings KK-08 957
## 68 68 63917 Hanford Elementary Kings KK-08 5079
## 73 73 63966 Lakeside Union Elementary Kings KK-08 499
## 78 78 63008 Rio Dell Elementary Humboldt KK-08 329
## 114 114 66589 Magnolia Elementary Orange KK-06 6518
## 144 144 62737 Cuddeback Union Elementary Humboldt KK-08 149
## 145 145 62703 Blue Lake Union Elementary Humboldt KK-08 220
## 148 148 62984 Peninsula Union Elementary Humboldt KK-08 133
## 156 156 66456 Buena Park Elementary Orange KK-08 5620
## 157 157 66746 Westminster Elementary Orange KK-08 9775
## 158 158 63933 Island Union Elementary Kings KK-08 246
## 170 170 66696 Savanna Elementary Orange KK-06 2409
## 176 176 63032 South Bay Union Elementary Humboldt KK-06 575
## 184 184 62802 Fortuna Union Elementary Humboldt KK-08 780
## 191 191 66506 Fullerton Elementary Orange KK-08 12567
## 194 194 63016 Rohnerville Elementary Humboldt KK-08 577
## 206 206 62927 Loleta Union Elementary Humboldt KK-08 168
## 221 221 63958 Kit Carson Union Elementary Kings KK-08 412
## 231 231 63883 Central Union Elementary Kings KK-08 1789
## 239 239 63057 Trinidad Union Elementary Humboldt KK-08 160
## 243 243 66472 Centralia Elementary Orange KK-06 5205
## 246 246 63024 Scotia Union Elementary Humboldt KK-08 370
## 284 284 62745 Cutten Elementary Humboldt KK-06 515
## 292 292 62679 Arcata Elementary Humboldt KK-08 966
## 296 296 66613 Ocean View Elementary Orange KK-08 9850
## 305 305 66530 Huntington Beach City Elementary Orange KK-08 6601
## 311 311 63990 Pioneer Union Elementary Kings KK-08 992
## 320 320 62828 Freshwater Elementary Humboldt KK-06 245
## 325 325 63941 Kings River-Hardwick Union Elementary Kings KK-08 576
## 330 330 66480 Cypress Elementary Orange KK-06 4734
## 336 336 62794 Fieldbrook Elementary Humboldt KK-08 129
## 341 341 66498 Fountain Valley Elementary Orange KK-08 6257
## 346 346 62885 Hydesville Elementary Humboldt KK-08 162
## 366 366 62976 Pacific Union Elementary Humboldt KK-08 594
## 374 374 62893 Jacoby Creek Elementary Humboldt KK-08 383
## teachers calworks lunch computer expenditure income english read
## 63 953.50 10.9114 82.3926 1048 4969.181 13.40063 58.2166557 629.6
## 64 138.50 14.9768 56.0636 496 4675.675 11.08100 17.0036469 634.4
## 65 50.00 27.1682 82.1317 149 5306.133 9.08200 17.6593533 634.7
## 68 224.50 25.5593 63.8036 585 5228.651 11.11600 16.6174450 637.7
## 73 30.00 40.3670 83.1193 94 5367.090 11.11600 22.8456917 635.5
## 78 18.53 28.9157 85.5422 37 5462.899 10.62900 6.0790277 644.3
## 114 332.10 12.2791 71.0238 499 5172.020 14.24290 43.4949379 638.5
## 144 8.50 19.4631 59.0604 30 5182.428 10.33300 0.0000000 656.8
## 145 13.39 19.1304 52.1739 48 5960.464 11.97200 0.0000000 658.0
## 148 7.55 25.0000 72.1429 25 5811.690 13.33500 0.0000000 644.3
## 156 267.00 10.3095 55.0939 316 5002.839 15.05138 36.1743774 644.0
## 157 484.50 13.0430 62.5039 1218 5337.625 15.41318 44.9718666 645.6
## 158 11.50 9.7561 43.9024 38 5161.202 11.08100 16.6666679 647.8
## 170 108.66 8.1515 52.1202 354 5100.976 14.25857 27.5217915 646.0
## 176 35.10 26.4348 59.1304 59 5504.025 14.22800 8.1739130 654.6
## 184 41.15 16.5394 50.7634 75 5261.371 14.19700 3.8461540 662.0
## 191 546.05 7.6231 40.3995 1099 4603.660 17.82300 27.5403824 651.6
## 194 29.95 20.3361 46.0504 134 4882.237 14.19700 0.0000000 657.2
## 206 11.01 29.1667 63.6905 30 5653.454 12.50200 12.5000000 655.9
## 221 20.10 8.7379 60.9223 72 5078.500 11.11600 9.4660196 653.0
## 231 96.00 2.9625 56.0089 435 7070.633 10.55667 9.5584126 655.4
## 239 8.70 16.0920 36.7816 37 6168.229 14.07400 0.0000000 672.8
## 243 242.50 8.3189 37.2911 474 4954.466 15.74992 22.9394817 655.1
## 246 19.80 6.5041 36.3144 56 4385.229 12.17400 0.5405406 664.0
## 284 27.35 16.1165 26.2136 58 5190.310 14.20900 0.0000000 660.2
## 292 47.96 20.9581 47.3054 114 5318.241 11.83400 0.6211180 674.0
## 296 489.30 3.3350 30.6437 1113 5081.456 20.87575 18.5583763 663.7
## 305 305.00 1.1968 15.0886 706 4747.296 25.03000 6.9080443 668.0
## 311 46.97 3.7298 22.8831 160 4615.909 11.11600 5.1411290 671.2
## 320 12.55 8.9796 13.0612 30 6732.066 15.57200 0.0000000 672.2
## 325 31.00 3.9931 28.1250 92 4907.393 11.11600 9.5486116 665.8
## 330 232.75 2.6078 19.3725 586 4922.940 18.82723 10.8576260 665.5
## 336 6.60 13.9535 27.1318 17 5195.919 11.83400 0.0000000 670.1
## 341 288.63 2.1635 15.5128 930 4889.480 21.09575 8.7262268 671.2
## 346 8.40 16.9591 33.9181 23 5210.725 10.64300 0.0000000 679.8
## 366 33.18 12.2689 25.3782 75 5371.173 11.83400 1.6835017 683.0
## 374 20.45 6.0052 10.1828 49 5351.523 15.38100 0.0000000 682.9
## math
## 63 636.7
## 64 632.9
## 65 633.1
## 68 630.5
## 73 633.6
## 78 626.1
## 114 643.0
## 144 633.7
## 145 633.1
## 148 647.2
## 156 649.4
## 157 648.2
## 158 646.1
## 170 652.3
## 176 646.5
## 184 640.8
## 191 652.6
## 194 647.5
## 206 651.8
## 221 657.7
## 231 657.9
## 239 642.2
## 243 660.5
## 246 652.7
## 284 666.8
## 292 654.6
## 296 665.8
## 305 664.0
## 311 661.9
## 320 662.7
## 325 671.0
## 330 672.4
## 336 669.5
## 341 671.3
## 346 663.6
## 366 668.4
## 374 673.2
Display first 20 rows of new dataframe
head(newdf, 20)
## School County Students Reading Math
## 3 Thermalito Union Butte 1550 636.3 650.9
## 5 Palermo Union Butte 1335 641.8 639.9
## 10 Del Paso Heights Sacramento 2247 611.9 613.4
## 12 West Fresno Fresno 987 616.6 616.0
## 19 Lennox Los Angeles 6880 619.1 620.5
## 20 Lamont Kern 2688 621.3 619.3
## 23 Wasco Union Kern 2538 622.9 620.6
## 25 Livingston Union Merced 2357 619.5 625.7
## 26 Woodlake Union Tulare 1588 625.0 621.2
## 27 Alisal Union Monterey 7306 620.4 626.0
## 28 Arvin Union Kern 2601 616.5 630.4
## 31 San Ysidro San Diego 4142 620.4 628.7
## 32 Soledad Unified Monterey 2102 623.0 626.9
## 33 Mountain View Los Angeles 10012 620.8 629.8
## 34 King City Union Monterey 2488 626.1 625.6
## 35 Ontario-Montclair San Bernardino 25151 625.4 626.8
## 36 Los Nietos Los Angeles 2267 625.4 628.2
## 37 Winton Merced 1657 623.6 630.2
## 39 Ravenswood City San Mateo 5370 624.4 630.1
## 40 Richland-Lerdo Union Kern 2471 627.5 627.1
Display only rows with “Orange County”, “Kings County”, and “Humboldt County” in County column:
newcols <- c("Orange County", "Kings County", "Humboldt County")
head(newdf[newdf$County %in% newcols,], 20)
## School County Students Reading Math
## 63 Anaheim Orange County 20927 629.6 636.7
## 64 Lemoore Union Kings County 3017 634.4 632.9
## 65 Armona Union Kings County 957 634.7 633.1
## 68 Hanford Kings County 5079 637.7 630.5
## 114 Magnolia Orange County 6518 638.5 643.0
## 156 Buena Park Orange County 5620 644.0 649.4
## 157 Westminster Orange County 9775 645.6 648.2
## 170 Savanna Orange County 2409 646.0 652.3
## 191 Fullerton Orange County 12567 651.6 652.6
## 231 Central Union Kings County 1789 655.4 657.9
## 243 Centralia Orange County 5205 655.1 660.5
## 292 Arcata Humboldt County 966 674.0 654.6
## 296 Ocean View Orange County 9850 663.7 665.8
## 305 Huntington Beach City Orange County 6601 668.0 664.0
## 311 Pioneer Union Kings County 992 671.2 661.9
## 330 Cypress Orange County 4734 665.5 672.4
## 341 Fountain Valley Orange County 6257 671.2 671.3