Q1: Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.

A1:

# plyr is attached with library() so that a missing package stops the script
# right here; require() would only return FALSE with a warning and carry on.
library(plyr)
# read the dataset from the provided link
car_warns <- read.csv(url("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv"), header = TRUE)
summary(car_warns)
##        X            speed           period         warning     
##  Min.   :   1   Min.   :19.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2110   1st Qu.:33.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median :4219   Median :37.00   Median :2.000   Median :2.000  
##  Mean   :4219   Mean   :37.82   Mean   :2.004   Mean   :1.507  
##  3rd Qu.:6328   3rd Qu.:42.00   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :8437   Max.   :67.00   Max.   :3.000   Max.   :2.000  
##       pair       
##  Min.   : 1.000  
##  1st Qu.: 4.000  
##  Median : 8.000  
##  Mean   : 7.559  
##  3rd Qu.:11.000  
##  Max.   :14.000
# display the first 6 rows (head() returns 6 rows when no count is given)
head(car_warns)
##   X speed period warning pair
## 1 1    26      1       1    1
## 2 2    26      1       1    1
## 3 3    26      1       1    1
## 4 4    26      1       1    1
## 5 5    27      1       1    1
## 6 6    28      1       1    1
# Mean and median of the speed readings in the full data set
mean(car_warns[["speed"]])
## [1] 37.82423
median(car_warns[["speed"]])
## [1] 37
# Alternative kept for reference: mean warning per speed/period combination
# head(aggregate(warning ~ speed + period, car_warns, mean))
# Alternative kept for reference: both statistics per period via plyr's each()
# aggregate(cbind(speed, warning) ~ period, car_warns, each(mean, median))
# Mean and median of the warning column in the full data set
mean(car_warns[["warning"]])
## [1] 1.506697
median(car_warns[["warning"]])
## [1] 2

Q2: Create a new data frame with a subset of the columns and rows. Make sure to rename it.

A2:

# Build the new data frame from the rows of car_warns whose speed is above 40
# and whose warning value is at least 2; all columns are kept.
# A random alternative (returns different rows on every run):
# new_data = car_warns[sample(1:nrow(car_warns), 10, replace=FALSE), c(1, 2:4)]
new_data <- car_warns[which(car_warns$speed > 40 & car_warns$warning >= 2), ]
head(new_data, n = 30)
##       X speed period warning pair
## 291 291    41      1       2    1
## 292 292    41      1       2    1
## 293 293    41      1       2    1
## 294 294    41      1       2    1
## 295 295    41      1       2    1
## 296 296    43      1       2    1
## 297 297    45      1       2    1
## 298 298    45      1       2    1
## 299 299    48      1       2    1
## 300 300    51      1       2    1
## 370 370    41      2       2    1
## 371 371    42      2       2    1
## 372 372    42      2       2    1
## 373 373    42      2       2    1
## 374 374    42      2       2    1
## 375 375    42      2       2    1
## 376 376    43      2       2    1
## 377 377    43      2       2    1
## 378 378    43      2       2    1
## 379 379    43      2       2    1
## 380 380    43      2       2    1
## 381 381    44      2       2    1
## 382 382    44      2       2    1
## 383 383    44      2       2    1
## 384 384    45      2       2    1
## 385 385    45      2       2    1
## 386 386    45      2       2    1
## 387 387    45      2       2    1
## 388 388    45      2       2    1
## 389 389    45      2       2    1
# Give the columns of the new data frame descriptive labels
colnames(new_data) <- c("record", "avg.speed", "quarter", "no.warns", "PA")
head(new_data, n = 30)
##     record avg.speed quarter no.warns PA
## 291    291        41       1        2  1
## 292    292        41       1        2  1
## 293    293        41       1        2  1
## 294    294        41       1        2  1
## 295    295        41       1        2  1
## 296    296        43       1        2  1
## 297    297        45       1        2  1
## 298    298        45       1        2  1
## 299    299        48       1        2  1
## 300    300        51       1        2  1
## 370    370        41       2        2  1
## 371    371        42       2        2  1
## 372    372        42       2        2  1
## 373    373        42       2        2  1
## 374    374        42       2        2  1
## 375    375        42       2        2  1
## 376    376        43       2        2  1
## 377    377        43       2        2  1
## 378    378        43       2        2  1
## 379    379        43       2        2  1
## 380    380        43       2        2  1
## 381    381        44       2        2  1
## 382    382        44       2        2  1
## 383    383        44       2        2  1
## 384    384        45       2        2  1
## 385    385        45       2        2  1
## 386    386        45       2        2  1
## 387    387        45       2        2  1
## 388    388        45       2        2  1
## 389    389        45       2        2  1

Q3: Create new column names for the new data frame.

A3:

# Replace the column labels with short codes
colnames(new_data) <- c("RC", "SP", "Q", "W", "PA")
head(new_data, n = 30)
##      RC SP Q W PA
## 291 291 41 1 2  1
## 292 292 41 1 2  1
## 293 293 41 1 2  1
## 294 294 41 1 2  1
## 295 295 41 1 2  1
## 296 296 43 1 2  1
## 297 297 45 1 2  1
## 298 298 45 1 2  1
## 299 299 48 1 2  1
## 300 300 51 1 2  1
## 370 370 41 2 2  1
## 371 371 42 2 2  1
## 372 372 42 2 2  1
## 373 373 42 2 2  1
## 374 374 42 2 2  1
## 375 375 42 2 2  1
## 376 376 43 2 2  1
## 377 377 43 2 2  1
## 378 378 43 2 2  1
## 379 379 43 2 2  1
## 380 380 43 2 2  1
## 381 381 44 2 2  1
## 382 382 44 2 2  1
## 383 383 44 2 2  1
## 384 384 45 2 2  1
## 385 385 45 2 2  1
## 386 386 45 2 2  1
## 387 387 45 2 2  1
## 388 388 45 2 2  1
## 389 389 45 2 2  1

Q4: Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

A4:

# Overview of the filtered data frame: summary() prints the minimum, quartiles,
# mean and maximum of every column
summary(new_data)
##        RC             SP              Q               W    
##  Min.   : 291   Min.   :41.00   Min.   :1.000   Min.   :2  
##  1st Qu.:3180   1st Qu.:42.00   1st Qu.:1.000   1st Qu.:2  
##  Median :5115   Median :45.00   Median :2.000   Median :2  
##  Mean   :5059   Mean   :45.76   Mean   :2.074   Mean   :2  
##  3rd Qu.:6863   3rd Qu.:48.00   3rd Qu.:3.000   3rd Qu.:2  
##  Max.   :8437   Max.   :67.00   Max.   :3.000   Max.   :2  
##        PA        
##  Min.   : 1.000  
##  1st Qu.: 6.000  
##  Median : 9.000  
##  Mean   : 8.987  
##  3rd Qu.:13.000  
##  Max.   :14.000
# Mean of the speed column (SP) in the filtered data
mean(new_data[["SP"]])
## [1] 45.75581
# Median of the speed column (SP)
median(new_data[["SP"]])
## [1] 45
# Mean and median of the warning column (W)
mean(new_data[["W"]])
## [1] 2
median(new_data[["W"]])
## [1] 2
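# Compared with the whole data set, the filtered subset has a higher mean and median speed (45.76 and 45 vs. 37.82 and 37) and a higher mean warning value (2 vs. 1.51); the median warning value is 2 in both. This is expected, because the subset keeps only rows with speed > 40 and warning >= 2.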

Q5: For at least 3 values in a column please rename so that every value in that column is renamed.

A5:

# Recode three speed values as text labels. Writing a string into a numeric
# column turns the whole column into character, so convert it explicitly first
# and compare against strings instead of relying on implicit coercion in `==`.
new_data$SP <- as.character(new_data$SP)
new_data$SP[new_data$SP == "41"] <- "Good"
new_data$SP[new_data$SP == "45"] <- "Moderate"
new_data$SP[new_data$SP == "51"] <- "H.High"
# NOTE(review): only 41, 45 and 51 receive a label; every other speed stays as
# its number stored as text.

Q6: Display enough rows to see examples of all of steps 1-5 above.

A6:

# Show the first 30 rows: they display the short column names and the recoded
# speed labels next to the speeds that were not recoded
head(new_data, 30)
##      RC       SP Q W PA
## 291 291     Good 1 2  1
## 292 292     Good 1 2  1
## 293 293     Good 1 2  1
## 294 294     Good 1 2  1
## 295 295     Good 1 2  1
## 296 296       43 1 2  1
## 297 297 Moderate 1 2  1
## 298 298 Moderate 1 2  1
## 299 299       48 1 2  1
## 300 300   H.High 1 2  1
## 370 370     Good 2 2  1
## 371 371       42 2 2  1
## 372 372       42 2 2  1
## 373 373       42 2 2  1
## 374 374       42 2 2  1
## 375 375       42 2 2  1
## 376 376       43 2 2  1
## 377 377       43 2 2  1
## 378 378       43 2 2  1
## 379 379       43 2 2  1
## 380 380       43 2 2  1
## 381 381       44 2 2  1
## 382 382       44 2 2  1
## 383 383       44 2 2  1
## 384 384 Moderate 2 2  1
## 385 385 Moderate 2 2  1
## 386 386 Moderate 2 2  1
## 387 387 Moderate 2 2  1
## 388 388 Moderate 2 2  1
## 389 389 Moderate 2 2  1

Q7: BONUS – place the original .csv in a github file and have R read from the link.

A7:

The steps to place the dataset on your own github are the following:

  1. Download the required dataset from its source link to your local machine.

  2. After downloading the file on your computer, add and commit it into git from your terminal then push it to your github repo.

Note that Git must be installed on your local machine; you can check by running git --version.

  3. Open the file on GitHub and click "Raw" to get the direct link.
library(RCurl)
## Loading required package: bitops
# Read the copy of amis.csv through its raw GitHub link and summarise it
theurl <- "https://raw.githubusercontent.com/salma71/dataset2/master/amis.csv"
df_car <- read.table(theurl, sep = ",", header = TRUE)
summary(df_car)
##        X            speed           period         warning     
##  Min.   :   1   Min.   :19.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2110   1st Qu.:33.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median :4219   Median :37.00   Median :2.000   Median :2.000  
##  Mean   :4219   Mean   :37.82   Mean   :2.004   Mean   :1.507  
##  3rd Qu.:6328   3rd Qu.:42.00   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :8437   Max.   :67.00   Max.   :3.000   Max.   :2.000  
##       pair       
##  Min.   : 1.000  
##  1st Qu.: 4.000  
##  Median : 8.000  
##  Mean   : 7.559  
##  3rd Qu.:11.000  
##  Max.   :14.000