## Please select one, download it and perform the following tasks:
## Location of the States CSV in the Rdatasets GitHub repository.
url <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/carData/States.csv"
## Load the CSV (first line is the header; text columns stay character),
## then display the whole data frame.
edu_us <- read.csv(
  file = url,
  header = TRUE,
  stringsAsFactors = FALSE
)
edu_us
## X region pop SATV SATM percent dollars pay
## 1 AL ESC 4041 470 514 8 3.648 27
## 2 AK PAC 550 438 476 42 7.887 43
## 3 AZ MTN 3665 445 497 25 4.231 30
## 4 AR WSC 2351 470 511 6 3.334 23
## 5 CA PAC 29760 419 484 45 4.826 39
## 6 CO MTN 3294 456 513 28 4.809 31
## 7 CN NE 3287 430 471 74 7.914 43
## 8 DE SA 666 433 470 58 6.016 35
## 9 DC SA 607 409 441 68 8.210 39
## 10 FL SA 12938 418 466 44 5.154 30
## 11 GA SA 6478 401 443 57 4.860 29
## 12 HI PAC 1108 404 481 52 5.008 32
## 13 ID MTN 1007 466 502 17 3.200 25
## 14 IL ENC 11431 466 528 16 5.062 34
## 15 IN ENC 5544 408 459 54 5.051 32
## 16 IA WNC 2777 511 577 5 4.839 28
## 17 KS WNC 2478 492 548 10 5.009 29
## 18 KY ESC 3685 473 521 10 4.390 29
## 19 LA WSC 4220 476 517 9 4.012 26
## 20 ME NE 1228 423 463 60 5.894 28
## 21 MD SA 4781 430 478 59 6.184 38
## 22 MA NE 6016 427 473 72 6.351 36
## 23 MI ENC 9295 454 514 12 5.257 38
## 24 MN WNC 4375 477 542 14 5.260 33
## 25 MS ESC 2573 477 519 4 3.322 24
## 26 MO WNC 5117 473 522 12 4.415 28
## 27 MT MTN 799 464 523 20 5.184 26
## 28 NE WNC 1578 484 546 10 4.381 26
## 29 NV MTN 1202 434 487 24 4.564 32
## 30 NH NE 1109 442 486 67 5.504 31
## 31 NJ MA 7730 418 473 69 9.159 38
## 32 NM MTN 1515 480 527 12 4.446 26
## 33 NY MA 17990 412 470 70 8.500 42
## 34 NC SA 6629 401 440 55 4.802 29
## 35 ND WNC 639 505 564 6 3.685 23
## 36 OH ENC 10847 450 499 22 5.639 32
## 37 OK WSC 3146 478 523 9 3.742 24
## 38 OR PAC 2842 439 484 49 5.291 32
## 39 PA MA 11882 420 463 64 6.534 36
## 40 RI NE 1003 422 461 62 6.989 37
## 41 SC SA 3487 397 437 54 4.327 28
## 42 SD WNC 696 506 555 5 3.730 22
## 43 TN ESC 4877 483 525 12 3.707 28
## 44 TX WSC 16987 413 461 42 4.238 28
## 45 UT MTN 1723 492 539 5 2.993 25
## 46 VT NE 563 431 466 62 5.740 31
## 47 VA SA 6187 425 470 58 5.360 32
## 48 WA PAC 4867 437 486 44 5.045 33
## 49 WV SA 1793 443 490 15 5.046 26
## 50 WI ENC 4892 476 543 11 5.946 33
## 51 WY MTN 454 458 519 13 5.255 29
## Number of rows in the full data set (first element of its dimensions).
dim(edu_us)[1L]
## [1] 51
## 1. Use the summary function to gain an overview of the data set.
## Then display the mean and median for at least two attributes
## Per-column overview of the full data set.
summary(object = edu_us)
## X region pop SATV
## Length:51 Length:51 Min. : 454 Min. :397.0
## Class :character Class :character 1st Qu.: 1215 1st Qu.:422.5
## Mode :character Mode :character Median : 3294 Median :443.0
## Mean : 4877 Mean :448.2
## 3rd Qu.: 5780 3rd Qu.:474.5
## Max. :29760 Max. :511.0
## SATM percent dollars pay
## Min. :437.0 Min. : 4.00 Min. :2.993 Min. :22.00
## 1st Qu.:470.0 1st Qu.:11.50 1st Qu.:4.354 1st Qu.:27.50
## Median :490.0 Median :25.00 Median :5.045 Median :30.00
## Mean :497.4 Mean :33.75 Mean :5.175 Mean :30.94
## 3rd Qu.:522.5 3rd Qu.:57.50 3rd Qu.:5.689 3rd Qu.:33.50
## Max. :577.0 Max. :74.00 Max. :9.159 Max. :43.00
## Mean and median of the SATV column of the full data set.
m1 <- mean(edu_us[["SATV"]])
m1
## [1] 448.1569
med1 <- median(edu_us[["SATV"]])
med1
## [1] 443
## Mean and median of the pop column of the full data set.
m2 <- mean(edu_us[["pop"]])
m2
## [1] 4876.647
med2 <- median(edu_us[["pop"]])
med2
## [1] 3294
## SATV Mean :448.2
## SATV Median :443.0
## Pop Mean : 4877
## Pop Median : 3294
## 2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.
## Keep only the rows whose SATV exceeds 445 and every column except X and
## SATM. which() skips rows where the comparison is NA, and drop = FALSE
## guarantees the result stays a data frame.
df2 <- edu_us[
  which(edu_us[["SATV"]] > 445),
  setdiff(names(edu_us), c("X", "SATM")),
  drop = FALSE
]
df2
## region pop SATV percent dollars pay
## 1 ESC 4041 470 8 3.648 27
## 4 WSC 2351 470 6 3.334 23
## 6 MTN 3294 456 28 4.809 31
## 13 MTN 1007 466 17 3.200 25
## 14 ENC 11431 466 16 5.062 34
## 16 WNC 2777 511 5 4.839 28
## 17 WNC 2478 492 10 5.009 29
## 18 ESC 3685 473 10 4.390 29
## 19 WSC 4220 476 9 4.012 26
## 23 ENC 9295 454 12 5.257 38
## 24 WNC 4375 477 14 5.260 33
## 25 ESC 2573 477 4 3.322 24
## 26 WNC 5117 473 12 4.415 28
## 27 MTN 799 464 20 5.184 26
## 28 WNC 1578 484 10 4.381 26
## 32 MTN 1515 480 12 4.446 26
## 35 WNC 639 505 6 3.685 23
## 36 ENC 10847 450 22 5.639 32
## 37 WSC 3146 478 9 3.742 24
## 42 WNC 696 506 5 3.730 22
## 43 ESC 4877 483 12 3.707 28
## 45 MTN 1723 492 5 2.993 25
## 50 ENC 4892 476 11 5.946 33
## 51 MTN 454 458 13 5.255 29
## Number of rows that survived the SATV filter.
dim(df2)[1L]
## [1] 24
## 3. Create new column names for the new data frame.
## Replacement labels, listed in the same order as df2's six columns.
newcolnames <- c(
  "Reg", "PopCol", "SATVCol",
  "PerCol", "DllsCol", "PayCol"
)
## Apply the new labels and display the relabelled data frame.
colnames(df2) <- newcolnames
df2
## Reg PopCol SATVCol PerCol DllsCol PayCol
## 1 ESC 4041 470 8 3.648 27
## 4 WSC 2351 470 6 3.334 23
## 6 MTN 3294 456 28 4.809 31
## 13 MTN 1007 466 17 3.200 25
## 14 ENC 11431 466 16 5.062 34
## 16 WNC 2777 511 5 4.839 28
## 17 WNC 2478 492 10 5.009 29
## 18 ESC 3685 473 10 4.390 29
## 19 WSC 4220 476 9 4.012 26
## 23 ENC 9295 454 12 5.257 38
## 24 WNC 4375 477 14 5.260 33
## 25 ESC 2573 477 4 3.322 24
## 26 WNC 5117 473 12 4.415 28
## 27 MTN 799 464 20 5.184 26
## 28 WNC 1578 484 10 4.381 26
## 32 MTN 1515 480 12 4.446 26
## 35 WNC 639 505 6 3.685 23
## 36 ENC 10847 450 22 5.639 32
## 37 WSC 3146 478 9 3.742 24
## 42 WNC 696 506 5 3.730 22
## 43 ESC 4877 483 12 3.707 28
## 45 MTN 1723 492 5 2.993 25
## 50 ENC 4892 476 11 5.946 33
## 51 MTN 454 458 13 5.255 29
## 4. Use the summary function to create an overview of your new data frame.
## Then print the mean and median for the same two attributes. Please compare.
## Per-column overview of the filtered, relabelled data frame.
summary(object = df2)
## Reg PopCol SATVCol PerCol
## Length:24 Min. : 454 Min. :450.0 Min. : 4.00
## Class :character 1st Qu.: 1562 1st Qu.:466.0 1st Qu.: 7.50
## Mode :character Median : 2962 Median :476.0 Median :10.50
## Mean : 3659 Mean :476.5 Mean :11.50
## 3rd Qu.: 4500 3rd Qu.:483.2 3rd Qu.:13.25
## Max. :11431 Max. :511.0 Max. :28.00
## DllsCol PayCol
## Min. :2.993 Min. :22.00
## 1st Qu.:3.701 1st Qu.:25.00
## Median :4.402 Median :27.50
## Mean :4.386 Mean :27.88
## 3rd Qu.:5.093 3rd Qu.:29.50
## Max. :5.946 Max. :38.00
## Mean and median of SATVCol in the filtered data frame.
m11 <- mean(df2[["SATVCol"]])
m11
## [1] 476.5417
med11 <- median(df2[["SATVCol"]])
med11
## [1] 476
## Mean and median of PopCol in the filtered data frame.
m22 <- mean(df2[["PopCol"]])
m22
## [1] 3658.75
med22 <- median(df2[["PopCol"]])
med22
## [1] 2961.5
## SATVCol Mean :476.5
## SATVCol Median :476.0
## PopCol Mean : 3659
## PopCol Median : 2962
## 5. For at least 3 distinct values in a column, rename them so that every occurrence of each value in that column is renamed.
## For example, suppose I have 20 values of the letter "e" in one column.
## Rename those values so that all 20 would show as "excellent".
## Recode three region codes, touching only the Reg column. Comparing the
## whole data frame (df3 == "MTN") would test every cell of every column and
## overwrite a match wherever it occurred, not just in Reg.
df3 <- df2
## Lookup table: names are the current codes, values are their replacements.
region_recode <- c(MTN = "MTN_MTN", ENC = "ENC_ENC", WNC = "WNC_WNC")
## %in% never returns NA, so rows with a missing Reg value are left untouched.
recode_rows <- df3[["Reg"]] %in% names(region_recode)
df3[["Reg"]][recode_rows] <- unname(region_recode[df3[["Reg"]][recode_rows]])
df3
## Reg PopCol SATVCol PerCol DllsCol PayCol
## 1 ESC 4041 470 8 3.648 27
## 4 WSC 2351 470 6 3.334 23
## 6 MTN_MTN 3294 456 28 4.809 31
## 13 MTN_MTN 1007 466 17 3.200 25
## 14 ENC_ENC 11431 466 16 5.062 34
## 16 WNC_WNC 2777 511 5 4.839 28
## 17 WNC_WNC 2478 492 10 5.009 29
## 18 ESC 3685 473 10 4.390 29
## 19 WSC 4220 476 9 4.012 26
## 23 ENC_ENC 9295 454 12 5.257 38
## 24 WNC_WNC 4375 477 14 5.260 33
## 25 ESC 2573 477 4 3.322 24
## 26 WNC_WNC 5117 473 12 4.415 28
## 27 MTN_MTN 799 464 20 5.184 26
## 28 WNC_WNC 1578 484 10 4.381 26
## 32 MTN_MTN 1515 480 12 4.446 26
## 35 WNC_WNC 639 505 6 3.685 23
## 36 ENC_ENC 10847 450 22 5.639 32
## 37 WSC 3146 478 9 3.742 24
## 42 WNC_WNC 696 506 5 3.730 22
## 43 ESC 4877 483 12 3.707 28
## 45 MTN_MTN 1723 492 5 2.993 25
## 50 ENC_ENC 4892 476 11 5.946 33
## 51 MTN_MTN 454 458 13 5.255 29
## 6. Display enough rows to see examples of all of steps 1-5 above
## Show every row of df3: it carries the row filter, the dropped and renamed
## columns, and the recoded Reg values from the earlier steps.
df3
## Reg PopCol SATVCol PerCol DllsCol PayCol
## 1 ESC 4041 470 8 3.648 27
## 4 WSC 2351 470 6 3.334 23
## 6 MTN_MTN 3294 456 28 4.809 31
## 13 MTN_MTN 1007 466 17 3.200 25
## 14 ENC_ENC 11431 466 16 5.062 34
## 16 WNC_WNC 2777 511 5 4.839 28
## 17 WNC_WNC 2478 492 10 5.009 29
## 18 ESC 3685 473 10 4.390 29
## 19 WSC 4220 476 9 4.012 26
## 23 ENC_ENC 9295 454 12 5.257 38
## 24 WNC_WNC 4375 477 14 5.260 33
## 25 ESC 2573 477 4 3.322 24
## 26 WNC_WNC 5117 473 12 4.415 28
## 27 MTN_MTN 799 464 20 5.184 26
## 28 WNC_WNC 1578 484 10 4.381 26
## 32 MTN_MTN 1515 480 12 4.446 26
## 35 WNC_WNC 639 505 6 3.685 23
## 36 ENC_ENC 10847 450 22 5.639 32
## 37 WSC 3146 478 9 3.742 24
## 42 WNC_WNC 696 506 5 3.730 22
## 43 ESC 4877 483 12 3.707 28
## 45 MTN_MTN 1723 492 5 2.993 25
## 50 ENC_ENC 4892 476 11 5.946 33
## 51 MTN_MTN 454 458 13 5.255 29
## Row count of the recoded data frame.
dim(df3)[1L]
## [1] 24
## BONUS - place the original .csv in a github file and have R read from the link.
## This will be a very useful skill as you progress in your data science education and career.
## Raw GitHub link to the CSV used for the bonus task.
url2 <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/carData/States.csv"
## Read from url2 itself, so this step uses the link defined just above
## rather than depending on a variable set at the top of the script.
edu_us2 <- read.csv(url2, header = TRUE, stringsAsFactors = FALSE)
edu_us2
## X region pop SATV SATM percent dollars pay
## 1 AL ESC 4041 470 514 8 3.648 27
## 2 AK PAC 550 438 476 42 7.887 43
## 3 AZ MTN 3665 445 497 25 4.231 30
## 4 AR WSC 2351 470 511 6 3.334 23
## 5 CA PAC 29760 419 484 45 4.826 39
## 6 CO MTN 3294 456 513 28 4.809 31
## 7 CN NE 3287 430 471 74 7.914 43
## 8 DE SA 666 433 470 58 6.016 35
## 9 DC SA 607 409 441 68 8.210 39
## 10 FL SA 12938 418 466 44 5.154 30
## 11 GA SA 6478 401 443 57 4.860 29
## 12 HI PAC 1108 404 481 52 5.008 32
## 13 ID MTN 1007 466 502 17 3.200 25
## 14 IL ENC 11431 466 528 16 5.062 34
## 15 IN ENC 5544 408 459 54 5.051 32
## 16 IA WNC 2777 511 577 5 4.839 28
## 17 KS WNC 2478 492 548 10 5.009 29
## 18 KY ESC 3685 473 521 10 4.390 29
## 19 LA WSC 4220 476 517 9 4.012 26
## 20 ME NE 1228 423 463 60 5.894 28
## 21 MD SA 4781 430 478 59 6.184 38
## 22 MA NE 6016 427 473 72 6.351 36
## 23 MI ENC 9295 454 514 12 5.257 38
## 24 MN WNC 4375 477 542 14 5.260 33
## 25 MS ESC 2573 477 519 4 3.322 24
## 26 MO WNC 5117 473 522 12 4.415 28
## 27 MT MTN 799 464 523 20 5.184 26
## 28 NE WNC 1578 484 546 10 4.381 26
## 29 NV MTN 1202 434 487 24 4.564 32
## 30 NH NE 1109 442 486 67 5.504 31
## 31 NJ MA 7730 418 473 69 9.159 38
## 32 NM MTN 1515 480 527 12 4.446 26
## 33 NY MA 17990 412 470 70 8.500 42
## 34 NC SA 6629 401 440 55 4.802 29
## 35 ND WNC 639 505 564 6 3.685 23
## 36 OH ENC 10847 450 499 22 5.639 32
## 37 OK WSC 3146 478 523 9 3.742 24
## 38 OR PAC 2842 439 484 49 5.291 32
## 39 PA MA 11882 420 463 64 6.534 36
## 40 RI NE 1003 422 461 62 6.989 37
## 41 SC SA 3487 397 437 54 4.327 28
## 42 SD WNC 696 506 555 5 3.730 22
## 43 TN ESC 4877 483 525 12 3.707 28
## 44 TX WSC 16987 413 461 42 4.238 28
## 45 UT MTN 1723 492 539 5 2.993 25
## 46 VT NE 563 431 466 62 5.740 31
## 47 VA SA 6187 425 470 58 5.360 32
## 48 WA PAC 4867 437 486 44 5.045 33
## 49 WV SA 1793 443 490 15 5.046 26
## 50 WI ENC 4892 476 543 11 5.946 33
## 51 WY MTN 454 458 519 13 5.255 29