DataExplorer is an R package with commands for exploring datasets efficiently.

By Boxuan Cui, Data Scientist at Smarter Traveler

This script uses 6 commands to explore the dataset

dataset: BostonHousing

variables: the beginning dataset has 14; the ending dataset has 18, which includes 4 transformed variables

package: mlbench (the BostonHousing dataset is loaded from mlbench)

# Attach mlbench, the package the BostonHousing data set is loaded from.
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.4.3
# Load the BostonHousing data set from mlbench into the workspace.
data(list = "BostonHousing", package = "mlbench")

# Column-by-column summary of the loaded data.
summary(BostonHousing)
##       crim                zn             indus       chas   
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1: 35  
##  Median : 0.25651   Median :  0.00   Median : 9.69          
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14          
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10          
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74          
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio            b         
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
# Initial visualization: the first three commands used to explore the data.

library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.4.3

# Are there missing values, and what is the missing-data profile?
plot_missing(BostonHousing)

# What does the category frequency of each discrete variable look like?
plot_bar(BostonHousing)

# What is the distribution of each continuous variable?
plot_histogram(BostonHousing)

# Transform variables based on the results from the histograms.

# Store rad as a factor instead of a numeric column.
BostonHousing[["rad"]] <- as.factor(BostonHousing[["rad"]])

# Create new discrete (binary) variables: for each listed column, add a
# "<name>_d" factor column produced by ggplot2::cut_interval() with 2 intervals.
for (col in c("crim", "zn", "indus", "b")) {
  new_name <- paste0(col, "_d")
  two_bins <- ggplot2::cut_interval(BostonHousing[[col]], 2)
  BostonHousing[[new_name]] <- as.factor(two_bins)
}

# Bar charts for all discrete variables.
plot_bar(BostonHousing)

# Plot the variable medv against all other variables.
plot_boxplot(BostonHousing, by = "medv")

# Scatterplots against medv, leaving out the raw crim, zn, indus and b columns.
# NOTE(review): `size` is passed straight to plot_scatterplot(); newer
# DataExplorer releases may expect point styling via `geom_point_args`
# instead -- confirm against the installed version.
kept_cols <- setdiff(names(BostonHousing), c("crim", "zn", "indus", "b"))
plot_scatterplot(BostonHousing[kept_cols], by = "medv", size = 0.5)

# Colour-coded correlation plot: all variables against all other variables.
plot_correlation(BostonHousing)