# plyr provides each(), which appears in the commented-out aggregate() call
# further down. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the script carry on.
library(plyr)
# Load the amis data set from the Rdatasets GitHub mirror and summarise
# every column.
amis_source <- url(
  "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/boot/amis.csv"
)
car_warns <- read.csv(amis_source, header = TRUE)
summary(car_warns)
## X speed period warning
## Min. : 1 Min. :19.00 Min. :1.000 Min. :1.000
## 1st Qu.:2110 1st Qu.:33.00 1st Qu.:1.000 1st Qu.:1.000
## Median :4219 Median :37.00 Median :2.000 Median :2.000
## Mean :4219 Mean :37.82 Mean :2.004 Mean :1.507
## 3rd Qu.:6328 3rd Qu.:42.00 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :8437 Max. :67.00 Max. :3.000 Max. :2.000
## pair
## Min. : 1.000
## 1st Qu.: 4.000
## Median : 8.000
## Mean : 7.559
## 3rd Qu.:11.000
## Max. :14.000
# display the first 6 rows (head() shows 6 rows when no count is given)
head(car_warns)
## X speed period warning pair
## 1 1 26 1 1 1
## 2 2 26 1 1 1
## 3 3 26 1 1 1
## 4 4 26 1 1 1
## 5 5 27 1 1 1
## 6 6 28 1 1 1
# mean and median of the speed column over the whole data set
mean(car_warns$speed)
## [1] 37.82423
median(car_warns$speed)
## [1] 37
#head(aggregate(warning ~ speed + period, car_warns, mean))
# mean and median of the warning column over the whole data set
# (disabled alternative: mean/median of speed and warning per period)
# aggregate(cbind(speed, warning) ~ period, car_warns, each(mean, median))
mean(car_warns$warning)
## [1] 1.506697
median(car_warns$warning)
## [1] 2
# Build a subset holding only the rows with speed above 40 and warning >= 2.
# Random-sample alternative, left disabled because it returns different rows on every run:
# new_data = car_warns[sample(1:nrow(car_warns), 10, replace=FALSE), c(1, 2:4)] #this one gives different samples each time running the chunk
# which() drops rows where the condition is NA, matching subset().
new_data <- car_warns[
  which(car_warns$speed > 40 & car_warns$warning >= 2), ,
  drop = FALSE
]
head(new_data, n = 30)
## X speed period warning pair
## 291 291 41 1 2 1
## 292 292 41 1 2 1
## 293 293 41 1 2 1
## 294 294 41 1 2 1
## 295 295 41 1 2 1
## 296 296 43 1 2 1
## 297 297 45 1 2 1
## 298 298 45 1 2 1
## 299 299 48 1 2 1
## 300 300 51 1 2 1
## 370 370 41 2 2 1
## 371 371 42 2 2 1
## 372 372 42 2 2 1
## 373 373 42 2 2 1
## 374 374 42 2 2 1
## 375 375 42 2 2 1
## 376 376 43 2 2 1
## 377 377 43 2 2 1
## 378 378 43 2 2 1
## 379 379 43 2 2 1
## 380 380 43 2 2 1
## 381 381 44 2 2 1
## 382 382 44 2 2 1
## 383 383 44 2 2 1
## 384 384 45 2 2 1
## 385 385 45 2 2 1
## 386 386 45 2 2 1
## 387 387 45 2 2 1
## 388 388 45 2 2 1
## 389 389 45 2 2 1
# give the subset's columns descriptive names
new_data <- setNames(
  new_data,
  c("record", "avg.speed", "quarter", "no.warns", "PA")
)
head(new_data, n = 30)
## record avg.speed quarter no.warns PA
## 291 291 41 1 2 1
## 292 292 41 1 2 1
## 293 293 41 1 2 1
## 294 294 41 1 2 1
## 295 295 41 1 2 1
## 296 296 43 1 2 1
## 297 297 45 1 2 1
## 298 298 45 1 2 1
## 299 299 48 1 2 1
## 300 300 51 1 2 1
## 370 370 41 2 2 1
## 371 371 42 2 2 1
## 372 372 42 2 2 1
## 373 373 42 2 2 1
## 374 374 42 2 2 1
## 375 375 42 2 2 1
## 376 376 43 2 2 1
## 377 377 43 2 2 1
## 378 378 43 2 2 1
## 379 379 43 2 2 1
## 380 380 43 2 2 1
## 381 381 44 2 2 1
## 382 382 44 2 2 1
## 383 383 44 2 2 1
## 384 384 45 2 2 1
## 385 385 45 2 2 1
## 386 386 45 2 2 1
## 387 387 45 2 2 1
## 388 388 45 2 2 1
## 389 389 45 2 2 1
# rename the columns again, this time with short codes
colnames(new_data) <- c("RC", "SP", "Q", "W", "PA")
head(new_data, n = 30)
## RC SP Q W PA
## 291 291 41 1 2 1
## 292 292 41 1 2 1
## 293 293 41 1 2 1
## 294 294 41 1 2 1
## 295 295 41 1 2 1
## 296 296 43 1 2 1
## 297 297 45 1 2 1
## 298 298 45 1 2 1
## 299 299 48 1 2 1
## 300 300 51 1 2 1
## 370 370 41 2 2 1
## 371 371 42 2 2 1
## 372 372 42 2 2 1
## 373 373 42 2 2 1
## 374 374 42 2 2 1
## 375 375 42 2 2 1
## 376 376 43 2 2 1
## 377 377 43 2 2 1
## 378 378 43 2 2 1
## 379 379 43 2 2 1
## 380 380 43 2 2 1
## 381 381 44 2 2 1
## 382 382 44 2 2 1
## 383 383 44 2 2 1
## 384 384 45 2 2 1
## 385 385 45 2 2 1
## 386 386 45 2 2 1
## 387 387 45 2 2 1
## 388 388 45 2 2 1
## 389 389 45 2 2 1
# summary statistics for every column of the subset
summary(new_data)
## RC SP Q W
## Min. : 291 Min. :41.00 Min. :1.000 Min. :2
## 1st Qu.:3180 1st Qu.:42.00 1st Qu.:1.000 1st Qu.:2
## Median :5115 Median :45.00 Median :2.000 Median :2
## Mean :5059 Mean :45.76 Mean :2.074 Mean :2
## 3rd Qu.:6863 3rd Qu.:48.00 3rd Qu.:3.000 3rd Qu.:2
## Max. :8437 Max. :67.00 Max. :3.000 Max. :2
## PA
## Min. : 1.000
## 1st Qu.: 6.000
## Median : 9.000
## Mean : 8.987
## 3rd Qu.:13.000
## Max. :14.000
# mean of the speed column (SP) within the subset
mean(new_data$SP)
## [1] 45.75581
# median of the speed column (SP) within the subset
median(new_data$SP)
## [1] 45
# mean and median of the warning column (W) within the subset
mean(new_data$W)
## [1] 2
median(new_data$W)
## [1] 2
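# Based on the output above, the subset (speed > 40 and warning >= 2) has a higher mean and median speed (45.76 and 45) than the whole data set (37.82 and 37).
# Its mean warning value is also higher (2 vs. 1.507), while the median warning value is 2 in both.
# This is expected, because the subset keeps only the rows with high speed and warning >= 2.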
# Replace three specific speed values with text labels. The first replacement
# puts a string into the numeric SP column, so the whole column becomes
# character and every untouched speed is kept as text.
new_data$SP <- local({
  labelled <- new_data$SP
  relabel_rules <- list(
    list(41, "Good"),
    list(45, "Moderate"),
    list(51, "H.High")
  )
  # Apply the rules one after another, in the listed order.
  for (rule in relabel_rules) {
    labelled[labelled == rule[[1]]] <- rule[[2]]
  }
  labelled
})
head(new_data, n = 30)
## RC SP Q W PA
## 291 291 Good 1 2 1
## 292 292 Good 1 2 1
## 293 293 Good 1 2 1
## 294 294 Good 1 2 1
## 295 295 Good 1 2 1
## 296 296 43 1 2 1
## 297 297 Moderate 1 2 1
## 298 298 Moderate 1 2 1
## 299 299 48 1 2 1
## 300 300 H.High 1 2 1
## 370 370 Good 2 2 1
## 371 371 42 2 2 1
## 372 372 42 2 2 1
## 373 373 42 2 2 1
## 374 374 42 2 2 1
## 375 375 42 2 2 1
## 376 376 43 2 2 1
## 377 377 43 2 2 1
## 378 378 43 2 2 1
## 379 379 43 2 2 1
## 380 380 43 2 2 1
## 381 381 44 2 2 1
## 382 382 44 2 2 1
## 383 383 44 2 2 1
## 384 384 Moderate 2 2 1
## 385 385 Moderate 2 2 1
## 386 386 Moderate 2 2 1
## 387 387 Moderate 2 2 1
## 388 388 Moderate 2 2 1
## 389 389 Moderate 2 2 1
The steps to place the dataset in your own GitHub repository are the following:
Go to the dataset source link and download the required dataset to your local machine.
After downloading the file to your computer, add and commit it to git from your terminal, then push it to your GitHub repo:
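Create a new folder on your desktop and place the downloaded file in it.
Run `git init` inside the folder to make it a git repository.
Run `git add .` to add the files to git.
Run `git commit -m "any message you want"` to commit them.
Run `git push origin master` to push your changes to GitHub (this requires the GitHub repo to be registered as the remote `origin`, e.g. with `git remote add origin <repo-url>`).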
Note that git must be installed on your local machine; you can check by running `git --version`.
library(RCurl)
## Loading required package: bitops
# Read amis.csv from the second GitHub location as comma-separated text with
# a header row, then summarise every column.
theurl <- "https://raw.githubusercontent.com/salma71/dataset2/master/amis.csv"
df_car <- read.table(
  file = theurl,
  sep = ",",
  header = TRUE
)
summary(df_car)
## X speed period warning
## Min. : 1 Min. :19.00 Min. :1.000 Min. :1.000
## 1st Qu.:2110 1st Qu.:33.00 1st Qu.:1.000 1st Qu.:1.000
## Median :4219 Median :37.00 Median :2.000 Median :2.000
## Mean :4219 Mean :37.82 Mean :2.004 Mean :1.507
## 3rd Qu.:6328 3rd Qu.:42.00 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :8437 Max. :67.00 Max. :3.000 Max. :2.000
## pair
## Min. : 1.000
## 1st Qu.: 4.000
## Median : 8.000
## Mean : 7.559
## 3rd Qu.:11.000
## Max. :14.000