The library needed is mongolite
library(mongolite)
## Warning: package 'mongolite' was built under R version 4.1.3
The code below is to connect to the MongoDb client, database and collection. For this work MongoDB cloud was used
connection_string = 'mongodb+srv://esivw1eo:Family2020@cluster0.wxcci.mongodb.net/myFirstDatabase'
my_col = mongo(collection="airline_data", db= "mydatabase", url=connection_string)
Verifying the above code succefully connected to the database and collection by printing out total documents in the collection
paste("The total document in the selected collection is:",my_col$count())
## [1] "The total document in the selected collection is: 56"
Now we check the first record of the collection
my_col$iterate()$one()
## $airline
## [1] "Aer Lingus"
##
## $avail_seat_km_per_week
## [1] 320906734
##
## $incidents_85_99
## [1] "2"
##
## $fatal_accidents_85_99
## [1] 0
##
## $fatalities_85_99
## [1] "0"
##
## $incidents_00_14
## [1] "0"
##
## $fatal_accidents_00_14
## [1] "0"
##
## $fatalities_00_14
## [1] "0"
Checking the first 5 records of the collection
my_col$find(limit = 5)
## airline avail_seat_km_per_week incidents_85_99
## 1 Aer Lingus 320906734 2
## 2 Aeroflot* 1197672318 76
## 3 Aerolineas Argentinas 385803648 6
## 4 Aeromexico* 596871813 3
## 5 Air Canada 1865253802 2
## fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14
## 1 0 0 0 0
## 2 14 128 6 1
## 3 0 0 1 0
## 4 1 64 5 0
## 5 0 0 2 0
## fatalities_00_14
## 1 0
## 2 88
## 3 0
## 4 0
## 5 0
First query gets the first 5 records where the avail_seat_kn_per_week is greater than 500,000,000
query1 <- my_col$find('{"avail_seat_km_per_week": {"$gt": 500000000}}', limit = 5)
query1
## airline avail_seat_km_per_week incidents_85_99 fatal_accidents_85_99
## 1 Aeroflot* 1197672318 76 14
## 2 Aeromexico* 596871813 3 1
## 3 Air Canada 1865253802 2 0
## 4 Air France 3004002661 14 4
## 5 Air India* 869253552 2 1
## fatalities_85_99 incidents_00_14 fatal_accidents_00_14 fatalities_00_14
## 1 128 6 1 88
## 2 64 5 0 0
## 3 0 2 0 0
## 4 79 6 2 337
## 5 329 4 1 158
The next query gets records of all airlines starting with the letter C
query2 <- my_col$find('{"airline": {"$regex": "^C"}}')
query2
## airline avail_seat_km_per_week incidents_85_99 fatal_accidents_85_99
## 1 Cathay Pacific* 2582459303 0 0
## 2 China Airlines 813216487 12 6
## 3 Condor 417982610 2 1
## 4 COPA 550491507 3 1
## fatalities_85_99 incidents_00_14 fatal_accidents_00_14 fatalities_00_14
## 1 0 2 0 0
## 2 535 2 1 225
## 3 16 0 0 0
## 4 47 0 0 0
The third query gets record of first 5 airlines that had at least 1 incidents_85_99.
Note:, since the data type of incidents_85_99 stored in the mongodb collection is chr, the value 0 was enclosed in double quotation.
query3 <- my_col$find('{"incidents_85_99": {"$gt": "0"}}', fields = '{"airline": true, "incidents_85_99": true}', limit=5)
query3
## _id airline incidents_85_99
## 1 1 Aer Lingus 2
## 2 2 Aeroflot* 76
## 3 3 Aerolineas Argentinas 6
## 4 4 Aeromexico* 3
## 5 5 Air Canada 2
df <- as.data.frame(my_col$find())
tail(df, 5)
## airline avail_seat_km_per_week incidents_85_99
## 52 United / Continental* 7139291291 19
## 53 US Airways / America West* 2455687887 16
## 54 Vietnam Airlines 625084918 7
## 55 Virgin Atlantic 1005248585 1
## 56 Xiamen Airlines 430462962 9
## fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14
## 52 8 319 14 2
## 53 7 224 11 2
## 54 3 171 1 0
## 55 0 0 0 0
## 56 1 82 2 0
## fatalities_00_14
## 52 109
## 53 23
## 54 0
## 55 0
## 56 0
we can check the df for missing values using the code below.
which(is.na(df))
## integer(0)
From the result above, we see that there are no missing values. We can also check the total number of records with complete cases (that is all fields of that record has no missing value)
nrow(df[complete.cases(df),])
## [1] 56
We can convert some of the columns to the appropriateb data type
cols.num <- c("avail_seat_km_per_week", "incidents_85_99", "fatal_accidents_85_99", "fatalities_85_99", "incidents_00_14", "fatal_accidents_00_14", "fatalities_00_14")
df[cols.num] <- sapply(df[cols.num], as.numeric)
To confirm that the above data type change was effected, we check the class of each column.
sapply(df, class)
## airline avail_seat_km_per_week incidents_85_99
## "character" "numeric" "numeric"
## fatal_accidents_85_99 fatalities_85_99 incidents_00_14
## "numeric" "numeric" "numeric"
## fatal_accidents_00_14 fatalities_00_14
## "numeric" "numeric"
Now a summary of the dataframe
summary(df)
## airline avail_seat_km_per_week incidents_85_99
## Length:56 Min. :2.594e+08 Min. : 0.000
## Class :character 1st Qu.:4.740e+08 1st Qu.: 2.000
## Mode :character Median :8.029e+08 Median : 4.000
## Mean :1.385e+09 Mean : 7.179
## 3rd Qu.:1.847e+09 3rd Qu.: 8.000
## Max. :7.139e+09 Max. :76.000
## fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14
## Min. : 0.000 Min. : 0.0 Min. : 0.000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0 1st Qu.: 1.000 1st Qu.:0.0000
## Median : 1.000 Median : 48.5 Median : 3.000 Median :0.0000
## Mean : 2.179 Mean :112.4 Mean : 4.125 Mean :0.6607
## 3rd Qu.: 3.000 3rd Qu.:184.2 3rd Qu.: 5.250 3rd Qu.:1.0000
## Max. :14.000 Max. :535.0 Max. :24.000 Max. :3.0000
## fatalities_00_14
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 55.52
## 3rd Qu.: 83.25
## Max. :537.00