Data Source

Description: https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/doc/wooldridge/meap93.html

CSV: https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/wooldridge/meap93.csv

Description of Data

408 rows and 17 variables (the CSV loaded below also carries a leading `row` index column, so the data frame has 18 columns):

  1. lnchprg. perc of studs in sch lnch prog
  2. enroll. school enrollment
  3. staff. staff per 1000 students
  4. expend. expend. per stud, $
  5. salary. avg. teacher salary, $
  6. benefits. avg. teacher benefits, $
  7. droprate. school dropout rate, perc
  8. gradrate. school graduation rate, perc
  9. math10. perc studs passing MEAP math
  10. sci11. perc studs passing MEAP science
  11. totcomp. salary + benefits
  12. ltotcomp. log(totcomp)
  13. lexpend. log of expend
  14. lenroll. log(enroll)
  15. lstaff. log(staff)
  16. bensal. benefits/salary
  17. lsalary. log(salary)

Load data in CSV file to a data frame

Load data as data.frame

# Directory holding the local copy of the MEAP93 CSV.
# NOTE(review): machine-specific path -- adjust it for your environment.
meap_data_dir <- "C:/Users/stina/Documents/R programming Bridge Workshop/Week 2 Assignment"
# Load the data into a data frame. The full path is built with file.path()
# instead of calling setwd(), so the session's working directory is untouched.
data.frame_meap <- read.csv(file.path(meap_data_dir, "Meap93Data.csv"))

Questions

(1) Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

Summary of MEAP data

# Per-column overview of the full MEAP data frame
# (min, quartiles, median, mean and max for each column)
print(summary(data.frame_meap))
##       row           lnchprg          enroll          staff       
##  Min.   :  1.0   Min.   : 1.40   Min.   :  212   Min.   : 65.90  
##  1st Qu.:102.8   1st Qu.:14.62   1st Qu.: 1038   1st Qu.: 91.45  
##  Median :204.5   Median :23.85   Median : 1840   Median : 99.00  
##  Mean   :204.5   Mean   :25.20   Mean   : 2664   Mean   :100.64  
##  3rd Qu.:306.2   3rd Qu.:33.83   3rd Qu.: 3085   3rd Qu.:108.03  
##  Max.   :408.0   Max.   :79.50   Max.   :16793   Max.   :166.60  
##      expend         salary         benefits        droprate     
##  Min.   :3332   Min.   :19764   Min.   :    0   Min.   : 0.000  
##  1st Qu.:3821   1st Qu.:28186   1st Qu.: 5536   1st Qu.: 1.900  
##  Median :4145   Median :31266   Median : 6304   Median : 3.700  
##  Mean   :4377   Mean   :31775   Mean   : 6463   Mean   : 5.066  
##  3rd Qu.:4659   3rd Qu.:34500   3rd Qu.: 7228   3rd Qu.: 6.500  
##  Max.   :7419   Max.   :52812   Max.   :11618   Max.   :61.900  
##     gradrate          math10          sci11          totcomp     
##  Min.   : 23.50   Min.   : 1.90   Min.   : 7.20   Min.   :24498  
##  1st Qu.: 77.00   1st Qu.:16.62   1st Qu.:41.30   1st Qu.:34032  
##  Median : 86.30   Median :23.40   Median :49.10   Median :37444  
##  Mean   : 83.65   Mean   :24.11   Mean   :49.18   Mean   :38238  
##  3rd Qu.: 93.22   3rd Qu.:30.05   3rd Qu.:57.15   3rd Qu.:41637  
##  Max.   :127.10   Max.   :66.70   Max.   :85.70   Max.   :63518  
##     ltotcomp        lexpend         lenroll          lstaff     
##  Min.   :10.11   Min.   :8.111   Min.   :5.357   Min.   :4.188  
##  1st Qu.:10.44   1st Qu.:8.248   1st Qu.:6.945   1st Qu.:4.516  
##  Median :10.53   Median :8.330   Median :7.518   Median :4.595  
##  Mean   :10.54   Mean   :8.370   Mean   :7.510   Mean   :4.603  
##  3rd Qu.:10.64   3rd Qu.:8.447   3rd Qu.:8.034   3rd Qu.:4.682  
##  Max.   :11.06   Max.   :8.912   Max.   :9.729   Max.   :5.116  
##      bensal          lsalary      
##  Min.   :0.0000   Min.   : 9.892  
##  1st Qu.:0.1880   1st Qu.:10.247  
##  Median :0.2024   Median :10.350  
##  Mean   :0.2045   Mean   :10.354  
##  3rd Qu.:0.2203   3rd Qu.:10.449  
##  Max.   :0.4500   Max.   :10.874

Mean and Median of enroll attribute

# Mean enrollment over the full data set; wrapping the assignment in
# parentheses stores the value and prints it in one step.
(all.enroll_mean <- mean(data.frame_meap[["enroll"]]))
## [1] 2663.806
# Median enrollment over the full data set, stored and printed the same way.
(all.enroll_median <- median(data.frame_meap[["enroll"]]))
## [1] 1840.5

Mean and Median of math10 attribute

# Mean of math10 over the full data set; the parenthesised assignment
# stores the value and prints it in one step.
(all.math10_mean <- mean(data.frame_meap[["math10"]]))
## [1] 24.10686
# Median of math10 over the full data set, stored and printed the same way.
(all.math10_median <- median(data.frame_meap[["math10"]]))
## [1] 23.4

(2) Create a new data frame with a subset of the columns and rows. Make sure to rename it.

Subset: first 20 rows of columns lnchprg, enroll, math10, sci11

# Columns kept in the subset, chosen by name rather than by position: the
# loaded data has a leading `row` column, so positional indices are offset by
# one from the variable numbering in the data description.
subset_columns <- c("lnchprg", "enroll", "math10", "sci11")
# First 20 rows of the selected columns, stored under a new name.
data.frame_MEAPsubset <- data.frame_meap[seq_len(20), subset_columns]
data.frame_MEAPsubset
##    lnchprg enroll math10 sci11
## 1      1.4   1862   56.4  67.9
## 2      2.3  11355   42.7  65.3
## 3      2.7   7685   43.8  54.3
## 4      3.4   1148   25.3  60.0
## 5      3.4   1572   15.3  65.8
## 6      3.4   2496   46.0  60.5
## 7      3.6   3358   33.6  67.4
## 8      3.6  11983   40.1  69.4
## 9      4.2   3499   42.1  71.7
## 10     4.2   5095   39.8  55.0
## 11     4.5  16793   30.8  58.1
## 12     4.5    984   14.6  74.6
## 13     5.1   1116   51.1  48.6
## 14     5.5   4156   29.2  55.0
## 15     5.5   4046   49.7  62.0
## 16     5.6  10695   42.6  53.1
## 17     5.8   3117   35.0  54.1
## 18     6.1   2168   35.9  45.4
## 19     6.2   2317   17.8  44.5
## 20     6.2   1391   25.5  74.4

(3) Create new column names for the new data frame.

# Give the first four columns of the subset descriptive names in a single
# vectorised assignment (positions 1-4, in order).
names(data.frame_MEAPsubset)[1:4] <- c(
  "lunch_program_rate",
  "enrollment_count",
  "math_pass_rate",
  "science_pass_rate"
)
# Show the subset with its new column names.
print(data.frame_MEAPsubset)
##    lunch_program_rate enrollment_count math_pass_rate science_pass_rate
## 1                 1.4             1862           56.4              67.9
## 2                 2.3            11355           42.7              65.3
## 3                 2.7             7685           43.8              54.3
## 4                 3.4             1148           25.3              60.0
## 5                 3.4             1572           15.3              65.8
## 6                 3.4             2496           46.0              60.5
## 7                 3.6             3358           33.6              67.4
## 8                 3.6            11983           40.1              69.4
## 9                 4.2             3499           42.1              71.7
## 10                4.2             5095           39.8              55.0
## 11                4.5            16793           30.8              58.1
## 12                4.5              984           14.6              74.6
## 13                5.1             1116           51.1              48.6
## 14                5.5             4156           29.2              55.0
## 15                5.5             4046           49.7              62.0
## 16                5.6            10695           42.6              53.1
## 17                5.8             3117           35.0              54.1
## 18                6.1             2168           35.9              45.4
## 19                6.2             2317           17.8              44.5
## 20                6.2             1391           25.5              74.4

(4) Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Per-column overview of the renamed subset
print(summary(data.frame_MEAPsubset))
##  lunch_program_rate enrollment_count math_pass_rate  science_pass_rate
##  Min.   :1.400      Min.   :  984    Min.   :14.60   Min.   :44.50    
##  1st Qu.:3.400      1st Qu.: 1790    1st Qu.:28.27   1st Qu.:54.25    
##  Median :4.350      Median : 3238    Median :37.85   Median :60.25    
##  Mean   :4.360      Mean   : 4842    Mean   :35.87   Mean   :60.35    
##  3rd Qu.:5.525      3rd Qu.: 5742    3rd Qu.:42.98   3rd Qu.:67.53    
##  Max.   :6.200      Max.   :16793    Max.   :56.40   Max.   :74.60
# Enrollment: mean and median within the subset

# mean of enroll in subset
subset.enroll_mean <- mean(data.frame_MEAPsubset[["enrollment_count"]])
print(subset.enroll_mean)
## [1] 4841.8
# median of enroll in subset
subset.enroll_median <- median(data.frame_MEAPsubset[["enrollment_count"]])
print(subset.enroll_median)
## [1] 3237.5
# Math pass rate (math10): mean and median within the subset

# mean of math10 in subset
subset.math10_mean <- mean(data.frame_MEAPsubset[["math_pass_rate"]])
print(subset.math10_mean)
## [1] 35.865
# median of math10 in subset
subset.math10_median <- median(data.frame_MEAPsubset[["math_pass_rate"]])
print(subset.math10_median)
## [1] 37.85

Compare enroll mean and median between subset and main data

# Report each statistic for the full data next to the same statistic for the
# subset. sprintf() is used instead of paste() so that no doubled spaces or
# " ." appear around the numbers, and "%.2f" rounds every value to two
# decimals so floating-point noise (e.g. 23.39999962) is not printed.
print(sprintf(
  "Mean enrollment of main data is %.2f while mean enrollment of subset is %.2f.",
  all.enroll_mean, subset.enroll_mean
))
## [1] "Mean enrollment of main data is 2663.81 while mean enrollment of subset is 4841.80."
print(sprintf(
  "Median enrollment of main data is %.2f while median enrollment of subset is %.2f.",
  all.enroll_median, subset.enroll_median
))
## [1] "Median enrollment of main data is 1840.50 while median enrollment of subset is 3237.50."
print(sprintf(
  "Mean math pass rate of main data is %.2f while mean math pass rate of subset is %.2f.",
  all.math10_mean, subset.math10_mean
))
## [1] "Mean math pass rate of main data is 24.11 while mean math pass rate of subset is 35.86."
print(sprintf(
  "Median math pass rate of main data is %.2f while median math pass rate of subset is %.2f.",
  all.math10_median, subset.math10_median
))
## [1] "Median math pass rate of main data is 23.40 while median math pass rate of subset is 37.85."

(5) For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.

In our data, we will set every salary value below 25,000 to 25,000.

library("dplyr")

# Rows whose salary is below 25000. The column is referenced by its bare name
# so filter() resolves it inside the data frame (data masking) instead of
# reaching back to the global object with `$`.
salaryLess25K <- filter(data.frame_meap, salary < 25000)
# Display only the columns of interest, selected by name so the output does
# not depend on column positions.
salaryLess25K[, c("lnchprg", "enroll", "salary", "math10", "sci11")]
##    lnchprg enroll salary math10 sci11
## 1     19.0   1260  22778   10.8  65.0
## 2     20.2    740  24887    9.8  42.2
## 3     23.9    297  20951    8.3  57.3
## 4     28.3   1047  24994   16.7  60.4
## 5     31.3   1514  24554   20.6  48.3
## 6     32.5   1067  24739   17.5  39.3
## 7     32.6    471  24768   11.6  40.9
## 8     32.9   1119  24907   24.1  37.2
## 9     34.0   2311  20394   13.8  53.8
## 10    34.6    575  22242   15.4  52.5
## 11    34.8   1173  24134   20.7  51.5
## 12    35.8    757  23969   11.9  85.7
## 13    37.3   2839  24709   27.5  56.5
## 14    38.6    507  24850   10.3  49.0
## 15    38.7    434  23039   10.5  33.3
## 16    38.8   1874  24270   15.3  45.0
## 17    39.0    340  23414   13.8  42.1
## 18    42.7    212  21674   18.8  34.1
## 19    45.7    394  23700   26.8  31.7
## 20    48.5    303  24058   33.3  41.4
## 21    49.9    852  20524    4.4  44.9
## 22    52.9    688  23437   30.0  46.5
## 23    59.9    363  19764    5.9  26.5
# Raise every salary below 25000 to 25000. pmax() applies the floor to the
# whole column at once and, unlike assignment through a logical index, does
# not fail if the column contains NA values.
data.frame_meap$salary <- pmax(data.frame_meap$salary, 25000)

# Rows whose salary now equals 25000; the column is referenced by bare name so
# filter() resolves it inside the data frame.
salary25K <- filter(data.frame_meap, salary == 25000)
# Only display columns lnchprg, enroll, salary, math10, and sci11 (by name).
salary25K[, c("lnchprg", "enroll", "salary", "math10", "sci11")]
##    lnchprg enroll salary math10 sci11
## 1     19.0   1260  25000   10.8  65.0
## 2     20.2    740  25000    9.8  42.2
## 3     23.9    297  25000    8.3  57.3
## 4     28.3   1047  25000   16.7  60.4
## 5     31.3   1514  25000   20.6  48.3
## 6     32.5   1067  25000   17.5  39.3
## 7     32.6    471  25000   11.6  40.9
## 8     32.9   1119  25000   24.1  37.2
## 9     34.0   2311  25000   13.8  53.8
## 10    34.6    575  25000   15.4  52.5
## 11    34.8   1173  25000   20.7  51.5
## 12    35.8    757  25000   11.9  85.7
## 13    37.3   2839  25000   27.5  56.5
## 14    38.6    507  25000   10.3  49.0
## 15    38.7    434  25000   10.5  33.3
## 16    38.8   1874  25000   15.3  45.0
## 17    39.0    340  25000   13.8  42.1
## 18    42.7    212  25000   18.8  34.1
## 19    45.7    394  25000   26.8  31.7
## 20    48.5    303  25000   33.3  41.4
## 21    49.9    852  25000    4.4  44.9
## 22    52.9    688  25000   30.0  46.5
## 23    59.9    363  25000    5.9  26.5

(6) Display enough rows to see examples of all of steps 1-5 above.

Display the first 50 of the 408 observations for columns lnchprg, enroll, salary, math10, and sci11

# Show the first 50 rows of the columns of interest. Columns are selected by
# name so the display does not depend on their positions, and head() returns
# only the rows that exist instead of padding with NA rows.
head(data.frame_meap[, c("lnchprg", "enroll", "salary", "math10", "sci11")], 50)
##    lnchprg enroll salary math10 sci11
## 1      1.4   1862  37498   56.4  67.9
## 2      2.3  11355  48722   42.7  65.3
## 3      2.7   7685  44541   43.8  54.3
## 4      3.4   1148  31566   25.3  60.0
## 5      3.4   1572  29781   15.3  65.8
## 6      3.4   2496  36801   46.0  60.5
## 7      3.6   3358  37863   33.6  67.4
## 8      3.6  11983  40133   40.1  69.4
## 9      4.2   3499  36451   42.1  71.7
## 10     4.2   5095  33449   39.8  55.0
## 11     4.5  16793  40859   30.8  58.1
## 12     4.5    984  42785   14.6  74.6
## 13     5.1   1116  34085   51.1  48.6
## 14     5.5   4156  29700   29.2  55.0
## 15     5.5   4046  47436   49.7  62.0
## 16     5.6  10695  40304   42.6  53.1
## 17     5.8   3117  38873   35.0  54.1
## 18     6.1   2168  35536   35.9  45.4
## 19     6.2   2317  37350   17.8  44.5
## 20     6.2   1391  31076   25.5  74.4
## 21     6.3   3691  35538   33.2  69.2
## 22     6.3   1673  31271   39.6  67.2
## 23     6.7   2671  35547   30.3  57.1
## 24     6.8    650  34194   54.8  72.8
## 25     6.9   2119  33345   62.4  66.7
## 26     7.8   1002  25453   29.7  56.5
## 27     7.8   2549  41451   36.9  56.7
## 28     8.0   5805  33234   22.4  57.9
## 29     8.0   2822  36954   23.9  50.9
## 30     8.1   5491  34499   39.2  45.0
## 31     8.2   2331  37519   22.9  57.1
## 32     8.3   4725  39481   27.8  64.6
## 33     8.4   2629  33867   36.9  67.1
## 34     8.4   2475  34520   37.3  58.4
## 35     8.5   1278  27955   21.3  65.5
## 36     8.6   8016  36461   48.7  57.0
## 37     8.7    913  31204   34.2  45.8
## 38     8.8    891  34105   32.8  51.0
## 39     8.8   5125  34315   33.2  39.4
## 40     9.0   3263  28948   31.7  58.3
## 41     9.1    444  26682   20.0  44.0
## 42     9.1  10591  39982   28.0  39.9
## 43     9.2  13684  52812   23.4  65.4
## 44     9.3   5926  28053   42.9  69.9
## 45     9.3   2100  37112   16.8  71.0
## 46     9.4    669  36183   30.9  47.7
## 47     9.5   1789  36622   16.2  40.4
## 48     9.5   2674  45358   23.6  49.6
## 49     9.6   2405  33341   18.1  60.4
## 50     9.6    891  32197   30.5  61.8