Connecting MongoDB collection with R

The library needed is mongolite

library(mongolite)
## Warning: package 'mongolite' was built under R version 4.1.3

The code below is to connect to the MongoDb client, database and collection. For this work MongoDB cloud was used

connection_string = 'mongodb+srv://esivw1eo:Family2020@cluster0.wxcci.mongodb.net/myFirstDatabase'

my_col = mongo(collection="airline_data", db= "mydatabase", url=connection_string)

Verifying the above code succefully connected to the database and collection by printing out total documents in the collection

paste("The total document in the selected collection is:",my_col$count())
## [1] "The total document in the selected collection is: 56"

Now we check the first record of the collection

my_col$iterate()$one()
## $airline
## [1] "Aer Lingus"
## 
## $avail_seat_km_per_week
## [1] 320906734
## 
## $incidents_85_99
## [1] "2"
## 
## $fatal_accidents_85_99
## [1] 0
## 
## $fatalities_85_99
## [1] "0"
## 
## $incidents_00_14
## [1] "0"
## 
## $fatal_accidents_00_14
## [1] "0"
## 
## $fatalities_00_14
## [1] "0"

Checking the first 5 records of the collection

my_col$find(limit = 5)
##                 airline avail_seat_km_per_week incidents_85_99
## 1            Aer Lingus              320906734               2
## 2             Aeroflot*             1197672318              76
## 3 Aerolineas Argentinas              385803648               6
## 4           Aeromexico*              596871813               3
## 5            Air Canada             1865253802               2
##   fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14
## 1                     0                0               0                     0
## 2                    14              128               6                     1
## 3                     0                0               1                     0
## 4                     1               64               5                     0
## 5                     0                0               2                     0
##   fatalities_00_14
## 1                0
## 2               88
## 3                0
## 4                0
## 5                0

Queries done on the MongoDB collection

First query gets the first 5 records where the avail_seat_kn_per_week is greater than 500,000,000

query1 <- my_col$find('{"avail_seat_km_per_week": {"$gt": 500000000}}', limit = 5)

query1
##       airline avail_seat_km_per_week incidents_85_99 fatal_accidents_85_99
## 1   Aeroflot*             1197672318              76                    14
## 2 Aeromexico*              596871813               3                     1
## 3  Air Canada             1865253802               2                     0
## 4  Air France             3004002661              14                     4
## 5  Air India*              869253552               2                     1
##   fatalities_85_99 incidents_00_14 fatal_accidents_00_14 fatalities_00_14
## 1              128               6                     1               88
## 2               64               5                     0                0
## 3                0               2                     0                0
## 4               79               6                     2              337
## 5              329               4                     1              158

The next query gets records of all airlines starting with the letter C

query2 <- my_col$find('{"airline": {"$regex": "^C"}}')

query2
##           airline avail_seat_km_per_week incidents_85_99 fatal_accidents_85_99
## 1 Cathay Pacific*             2582459303               0                     0
## 2  China Airlines              813216487              12                     6
## 3          Condor              417982610               2                     1
## 4            COPA              550491507               3                     1
##   fatalities_85_99 incidents_00_14 fatal_accidents_00_14 fatalities_00_14
## 1                0               2                     0                0
## 2              535               2                     1              225
## 3               16               0                     0                0
## 4               47               0                     0                0

The third query gets record of first 5 airlines that had at least 1 incidents_85_99.

Note:, since the data type of incidents_85_99 stored in the mongodb collection is chr, the value 0 was enclosed in double quotation.

query3 <- my_col$find('{"incidents_85_99": {"$gt": "0"}}', fields = '{"airline": true, "incidents_85_99": true}', limit=5)

query3
##   _id               airline incidents_85_99
## 1   1            Aer Lingus               2
## 2   2             Aeroflot*              76
## 3   3 Aerolineas Argentinas               6
## 4   4           Aeromexico*               3
## 5   5            Air Canada               2

Converting MongoDB collection to R data frame

df <- as.data.frame(my_col$find())
tail(df, 5)
##                       airline avail_seat_km_per_week incidents_85_99
## 52      United / Continental*             7139291291              19
## 53 US Airways / America West*             2455687887              16
## 54           Vietnam Airlines              625084918               7
## 55            Virgin Atlantic             1005248585               1
## 56            Xiamen Airlines              430462962               9
##    fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14
## 52                     8              319              14                     2
## 53                     7              224              11                     2
## 54                     3              171               1                     0
## 55                     0                0               0                     0
## 56                     1               82               2                     0
##    fatalities_00_14
## 52              109
## 53               23
## 54                0
## 55                0
## 56                0

we can check the df for missing values using the code below.

which(is.na(df))
## integer(0)

From the result above, we see that there are no missing values. We can also check the total number of records with complete cases (that is all fields of that record has no missing value)

nrow(df[complete.cases(df),])
## [1] 56

We can convert some of the columns to the appropriateb data type

cols.num <- c("avail_seat_km_per_week", "incidents_85_99", "fatal_accidents_85_99", "fatalities_85_99", "incidents_00_14", "fatal_accidents_00_14", "fatalities_00_14")

df[cols.num] <- sapply(df[cols.num], as.numeric)

To confirm that the above data type change was effected, we check the class of each column.

sapply(df, class)
##                airline avail_seat_km_per_week        incidents_85_99 
##            "character"              "numeric"              "numeric" 
##  fatal_accidents_85_99       fatalities_85_99        incidents_00_14 
##              "numeric"              "numeric"              "numeric" 
##  fatal_accidents_00_14       fatalities_00_14 
##              "numeric"              "numeric"

Now a summary of the dataframe

summary(df)
##    airline          avail_seat_km_per_week incidents_85_99 
##  Length:56          Min.   :2.594e+08      Min.   : 0.000  
##  Class :character   1st Qu.:4.740e+08      1st Qu.: 2.000  
##  Mode  :character   Median :8.029e+08      Median : 4.000  
##                     Mean   :1.385e+09      Mean   : 7.179  
##                     3rd Qu.:1.847e+09      3rd Qu.: 8.000  
##                     Max.   :7.139e+09      Max.   :76.000  
##  fatal_accidents_85_99 fatalities_85_99 incidents_00_14  fatal_accidents_00_14
##  Min.   : 0.000        Min.   :  0.0    Min.   : 0.000   Min.   :0.0000       
##  1st Qu.: 0.000        1st Qu.:  0.0    1st Qu.: 1.000   1st Qu.:0.0000       
##  Median : 1.000        Median : 48.5    Median : 3.000   Median :0.0000       
##  Mean   : 2.179        Mean   :112.4    Mean   : 4.125   Mean   :0.6607       
##  3rd Qu.: 3.000        3rd Qu.:184.2    3rd Qu.: 5.250   3rd Qu.:1.0000       
##  Max.   :14.000        Max.   :535.0    Max.   :24.000   Max.   :3.0000       
##  fatalities_00_14
##  Min.   :  0.00  
##  1st Qu.:  0.00  
##  Median :  0.00  
##  Mean   : 55.52  
##  3rd Qu.: 83.25  
##  Max.   :537.00