One of the challenges in working with data is wrangling. In this assignment we will use R to perform this task. Here is a list of data sets: http://vincentarelbundock.github.io/Rdatasets/ (click on the csv index for a list)
List = https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv
Please select one, download it and perform the following tasks:
csvurl <- "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/poxetc.csv"
# poxetc has data on "Deaths from various causes, in London from 1629 till 1881, with gaps".
poxdata <- read.table(file=csvurl, header=TRUE, sep=",")
summary(poxdata)
## X fpox measles all
## Min. : 1 Min. : 38 Min. : 1.00 Min. : 8427
## 1st Qu.: 64 1st Qu.: 722 1st Qu.: 64.25 1st Qu.:19278
## Median :127 Median :1231 Median : 212.00 Median :21305
## Mean :127 Mean :1406 Mean : 490.15 Mean :28968
## 3rd Qu.:190 3rd Qu.:1961 3rd Qu.: 714.00 3rd Qu.:26282
## Max. :253 Max. :7912 Max. :2788.00 Max. :97306
## NA's :19 NA's :33 NA's :19
## fpox2all measles2all
## Min. : 0.56 Min. : 0.050
## 1st Qu.: 27.73 1st Qu.: 3.143
## Median : 56.23 Median : 9.600
## Mean : 59.73 Mean :14.395
## 3rd Qu.: 89.78 3rd Qu.:22.170
## Max. :183.94 Max. :93.000
## NA's :19 NA's :33
#Ignoring the NA values
fpox_mean <-round(mean(poxdata$fpox, na.rm = TRUE), digits = 0)
measles_mean <- round(mean(poxdata$measles, na.rm = TRUE), digits = 0)
sprintf("The mean values for Pox and Measles are %s and %s respectively",fpox_mean, measles_mean)
## [1] "The mean values for Pox and Measles are 1406 and 490 respectively"
fpox_med <-round(median(poxdata$fpox, na.rm = TRUE), digits = 0)
measles_med <- round(median(poxdata$measles, na.rm = TRUE), digits = 0)
sprintf("The median values for Pox and Measles are %s and %s respectively",fpox_med, measles_med)
## [1] "The median values for Pox and Measles are 1231 and 212 respectively"
poxframe <- data.frame(poxdata,stringsAsFactors=FALSE)
# Only want to see where there are no missing values.
# poxframenew <- subset(poxframe, fpox != 'NA' & measles != 'NA') # This will remove NA values, but I want more;
# There are significantly more cases of fpox than measles, I want to see where it's the other way- more measles than pox
poxframenew <- subset(poxframe, fpox != 'NA' & measles != 'NA' & measles >= fpox)
poxframenew
## X fpox measles all fpox2all measles2all
## 180 180 1169 1386 19954 58.58 69.45
## 186 186 638 817 19283 33.08 42.36
## 188 188 653 1106 20316 32.14 54.43
## 190 190 421 728 19705 21.36 36.94
## 193 193 508 547 18451 27.53 29.64
## 194 194 604 712 18865 32.01 37.74
## 196 196 725 966 20237 35.82 17.73
## 198 198 503 774 20758 24.23 37.28
## 200 200 598 736 21709 27.54 33.90
## 203 203 563 750 25337 22.22 29.60
## 214 214 360 1293 45400 7.92 28.48
## 215 215 438 1442 48718 8.99 29.60
## 217 217 909 2318 48318 18.81 47.97
## 218 218 257 747 49450 5.10 15.10
## 219 219 955 1778 59131 16.15 30.06
## 221 221 521 1154 68755 7.57 16.78
## 222 222 499 980 48950 10.19 20.02
## 223 223 1062 1297 55488 19.13 23.37
## 225 225 211 978 60069 3.51 16.28
## 226 226 694 1409 73697 9.41 19.11
## 228 228 531 1479 57274 9.27 25.82
## 229 229 156 1341 59103 2.63 22.68
## 230 230 242 2369 64093 3.77 36.96
## 231 231 1158 1330 61860 18.55 21.50
## 232 232 898 2090 62309 14.41 33.54
## 233 233 217 1062 65251 3.32 16.27
## 234 234 366 2334 67371 5.43 34.64
## 236 236 547 2788 78238 6.99 35.63
## 237 237 640 1290 73531 8.70 17.53
## 238 238 1391 2220 80453 17.28 27.59
## 240 240 597 1962 73798 8.09 26.59
## 241 241 275 1456 78082 3.52 18.65
## 242 242 973 1449 77634 12.53 18.66
## 245 245 113 2149 75459 1.49 28.47
## 246 246 57 1680 76813 0.74 21.87
## 247 247 46 1408 81964 0.56 17.17
## 248 248 736 1720 77671 9.47 22.14
## 250 250 1417 1500 84188 16.83 17.81
## 251 251 450 2475 83805 5.36 29.53
## 252 252 475 1501 81128 5.85 18.50
## 253 253 2371 2533 81071 29.24 31.25
# There is only 41 instances of there being more measles than pox, out of 220 rows.
names(poxframenew)
## [1] "X" "fpox" "measles" "all" "fpox2all"
## [6] "measles2all"
colnames(poxframenew) <- c("Index","Pox","Measles","Total Deaths","Pox Deaths/ All","Measles Deaths/ All")
# While I'm unsure of the original significance of the last 2 columns, I know their value is derived by dividing the 'fpox'/'measles' columns by the 'all' column (Total Deaths), and multiplying by 1000. Perhaps these numbers stand for 1000's of deaths.
poxframenew
## Index Pox Measles Total Deaths Pox Deaths/ All Measles Deaths/ All
## 180 180 1169 1386 19954 58.58 69.45
## 186 186 638 817 19283 33.08 42.36
## 188 188 653 1106 20316 32.14 54.43
## 190 190 421 728 19705 21.36 36.94
## 193 193 508 547 18451 27.53 29.64
## 194 194 604 712 18865 32.01 37.74
## 196 196 725 966 20237 35.82 17.73
## 198 198 503 774 20758 24.23 37.28
## 200 200 598 736 21709 27.54 33.90
## 203 203 563 750 25337 22.22 29.60
## 214 214 360 1293 45400 7.92 28.48
## 215 215 438 1442 48718 8.99 29.60
## 217 217 909 2318 48318 18.81 47.97
## 218 218 257 747 49450 5.10 15.10
## 219 219 955 1778 59131 16.15 30.06
## 221 221 521 1154 68755 7.57 16.78
## 222 222 499 980 48950 10.19 20.02
## 223 223 1062 1297 55488 19.13 23.37
## 225 225 211 978 60069 3.51 16.28
## 226 226 694 1409 73697 9.41 19.11
## 228 228 531 1479 57274 9.27 25.82
## 229 229 156 1341 59103 2.63 22.68
## 230 230 242 2369 64093 3.77 36.96
## 231 231 1158 1330 61860 18.55 21.50
## 232 232 898 2090 62309 14.41 33.54
## 233 233 217 1062 65251 3.32 16.27
## 234 234 366 2334 67371 5.43 34.64
## 236 236 547 2788 78238 6.99 35.63
## 237 237 640 1290 73531 8.70 17.53
## 238 238 1391 2220 80453 17.28 27.59
## 240 240 597 1962 73798 8.09 26.59
## 241 241 275 1456 78082 3.52 18.65
## 242 242 973 1449 77634 12.53 18.66
## 245 245 113 2149 75459 1.49 28.47
## 246 246 57 1680 76813 0.74 21.87
## 247 247 46 1408 81964 0.56 17.17
## 248 248 736 1720 77671 9.47 22.14
## 250 250 1417 1500 84188 16.83 17.81
## 251 251 450 2475 83805 5.36 29.53
## 252 252 475 1501 81128 5.85 18.50
## 253 253 2371 2533 81071 29.24 31.25
summary(poxframenew)
## Index Pox Measles Total Deaths
## Min. :180.0 Min. : 46.0 Min. : 547 Min. :18451
## 1st Qu.:214.0 1st Qu.: 366.0 1st Qu.: 980 1st Qu.:45400
## Median :228.0 Median : 547.0 Median :1408 Median :61860
## Mean :223.7 Mean : 632.8 Mean :1465 Mean :56187
## 3rd Qu.:240.0 3rd Qu.: 736.0 3rd Qu.:1778 3rd Qu.:76813
## Max. :253.0 Max. :2371.0 Max. :2788 Max. :84188
## Pox Deaths/ All Measles Deaths/ All
## Min. : 0.56 Min. :15.10
## 1st Qu.: 5.43 1st Qu.:18.66
## Median : 9.47 Median :27.59
## Mean :14.76 Mean :28.26
## 3rd Qu.:21.36 3rd Qu.:33.90
## Max. :58.58 Max. :69.45
fpox_mean2 <-round(mean(poxframenew$Pox), digits = 0)
measles_mean2 <- round(mean(poxframenew$Measles), digits = 0)
sprintf("The mean values for Pox and Measles are %s and %s respectively",fpox_mean2, measles_mean2)
## [1] "The mean values for Pox and Measles are 633 and 1465 respectively"
fpox_med2 <-round(median(poxframenew$Pox), digits = 0)
measles_med2 <- round(median(poxframenew$Measles), digits = 0)
sprintf("The median values for Pox and Measles are %s and %s respectively",fpox_med2, measles_med2)
## [1] "The median values for Pox and Measles are 547 and 1408 respectively"
# The mean values for Pox and Measles were originally 1406 and 490, and became 633 and 1465 respectively.
# The median values for Pox and Measles were originally 1231 and 212, and became 547 and 1408 respectively.
# The values for Pox changed, with the mean and median values for Pox dropping significantly. The opposite is true for Measles, with the mean and median values rising significantly. This is because of my choice in sub-setting the original dataframe to only show me cases where there were more deaths caused by Measles, which led to me only including 41 rows of data from the filtered 220 rows. These 41 rows served to increase the mean/median values for Measles, and decrease the mean/median values for Pox.
# For this, I will break the numbers up into category. Measles Deaths column values range from 547 to 2788. When 1<=X<1000 is Mild, 1000<=X<2000 is Strong, 2000<=X is Severe.
# I will make a new dataframe so as to not rewrite and make accidental changes to prior dataframe
poxframenew2 <- data.frame(poxframenew,stringsAsFactors=FALSE)
poxframenew2["Measles"][poxframenew2["Measles"] >= 1 & poxframenew2["Measles"] < 1000] <- "Mild"
poxframenew2["Measles"][poxframenew2["Measles"] >= 1000 & poxframenew2["Measles"] < 2000] <- "Strong"
poxframenew2["Measles"][poxframenew2["Measles"] >= 2000 & poxframenew2["Measles"] < 9999] <- "Severe"
poxframenew2
## Index Pox Measles Total.Deaths Pox.Deaths..All Measles.Deaths..All
## 180 180 1169 Strong 19954 58.58 69.45
## 186 186 638 Mild 19283 33.08 42.36
## 188 188 653 Strong 20316 32.14 54.43
## 190 190 421 Mild 19705 21.36 36.94
## 193 193 508 Mild 18451 27.53 29.64
## 194 194 604 Mild 18865 32.01 37.74
## 196 196 725 Mild 20237 35.82 17.73
## 198 198 503 Mild 20758 24.23 37.28
## 200 200 598 Mild 21709 27.54 33.90
## 203 203 563 Mild 25337 22.22 29.60
## 214 214 360 Strong 45400 7.92 28.48
## 215 215 438 Strong 48718 8.99 29.60
## 217 217 909 Severe 48318 18.81 47.97
## 218 218 257 Mild 49450 5.10 15.10
## 219 219 955 Strong 59131 16.15 30.06
## 221 221 521 Strong 68755 7.57 16.78
## 222 222 499 Mild 48950 10.19 20.02
## 223 223 1062 Strong 55488 19.13 23.37
## 225 225 211 Mild 60069 3.51 16.28
## 226 226 694 Strong 73697 9.41 19.11
## 228 228 531 Strong 57274 9.27 25.82
## 229 229 156 Strong 59103 2.63 22.68
## 230 230 242 Severe 64093 3.77 36.96
## 231 231 1158 Strong 61860 18.55 21.50
## 232 232 898 Severe 62309 14.41 33.54
## 233 233 217 Strong 65251 3.32 16.27
## 234 234 366 Severe 67371 5.43 34.64
## 236 236 547 Severe 78238 6.99 35.63
## 237 237 640 Strong 73531 8.70 17.53
## 238 238 1391 Severe 80453 17.28 27.59
## 240 240 597 Strong 73798 8.09 26.59
## 241 241 275 Strong 78082 3.52 18.65
## 242 242 973 Strong 77634 12.53 18.66
## 245 245 113 Severe 75459 1.49 28.47
## 246 246 57 Strong 76813 0.74 21.87
## 247 247 46 Strong 81964 0.56 17.17
## 248 248 736 Strong 77671 9.47 22.14
## 250 250 1417 Strong 84188 16.83 17.81
## 251 251 450 Severe 83805 5.36 29.53
## 252 252 475 Strong 81128 5.85 18.50
## 253 253 2371 Severe 81071 29.24 31.25
print(head(poxframenew2, 20))
## Index Pox Measles Total.Deaths Pox.Deaths..All Measles.Deaths..All
## 180 180 1169 Strong 19954 58.58 69.45
## 186 186 638 Mild 19283 33.08 42.36
## 188 188 653 Strong 20316 32.14 54.43
## 190 190 421 Mild 19705 21.36 36.94
## 193 193 508 Mild 18451 27.53 29.64
## 194 194 604 Mild 18865 32.01 37.74
## 196 196 725 Mild 20237 35.82 17.73
## 198 198 503 Mild 20758 24.23 37.28
## 200 200 598 Mild 21709 27.54 33.90
## 203 203 563 Mild 25337 22.22 29.60
## 214 214 360 Strong 45400 7.92 28.48
## 215 215 438 Strong 48718 8.99 29.60
## 217 217 909 Severe 48318 18.81 47.97
## 218 218 257 Mild 49450 5.10 15.10
## 219 219 955 Strong 59131 16.15 30.06
## 221 221 521 Strong 68755 7.57 16.78
## 222 222 499 Mild 48950 10.19 20.02
## 223 223 1062 Strong 55488 19.13 23.37
## 225 225 211 Mild 60069 3.51 16.28
## 226 226 694 Strong 73697 9.41 19.11
# This will show we have the renamed Dataframe, with Mild, Strong, and Severe as values for the 'Pox' column.
library (readr)
mygiturl <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/poxetc.csv"
mydata<-read_csv(url(mygiturl))
## New names:
## Rows: 253 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (6): ...1, fpox, measles, all, fpox2all, measles2all
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
mydata
## # A tibble: 253 × 6
## ...1 fpox measles all fpox2all measles2all
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 72 42 8814 8.17 4.47
## 2 2 40 2 10471 3.82 0.19
## 3 3 58 3 8458 6.85 0.35
## 4 4 531 80 9539 55.7 8.38
## 5 5 72 21 8427 8.54 2.49
## 6 6 1354 33 10865 125. 3.03
## 7 7 293 27 10641 27.5 2.53
## 8 8 127 12 23382 5.48 0.51
## 9 9 NA NA NA NA NA
## 10 10 NA NA NA NA NA
## # ℹ 243 more rows