### ------ Final assignment

###
##  Boston House Price Dataset
###
## https://search.r-project.org/CRAN/refmans/mlbench/html/BostonHousing.html

# Description
# Housing data for 506 census tracts of Boston from the 1970 census. The dataframe BostonHousing contains the original data by Harrison and Rubinfeld (1979), the dataframe BostonHousing2 the corrected version with additional spatial information (see references below).
# 
# Usage
# data("BostonHousing", package = "mlbench")
# Format
# The original data are 506 observations on 14 variables, medv being the target variable:
#  
# crim   : per capita crime rate by town
# zn       : proportion of residential land zoned for lots over 25,000 sq.ft
# indus : proportion of non-retail business acres per town
# chas   : Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# nox     : nitric oxides concentration (parts per 10 million)
# rm       : average number of rooms per dwelling
# age     : proportion of owner-occupied units built prior to 1940
# dis     : weighted distances to five Boston employment centres
# rad     : index of accessibility to radial highways
# tax     : full-value property-tax rate per USD 10,000
# ptratio : pupil-teacher ratio by town
# b     : 1000(B−0.63)^2, where B is the proportion of blacks by town
# lstat : percentage of lower status of the population
# medv   : median value of owner-occupied homes in USD 1000's
#
# The corrected data set has the following additional columns:
# 
# cmedv corrected median value of owner-occupied homes in USD 1000's
# town  name of town
# tract census tract
# lon   longitude of census tract
# lat   latitude of census tract

# ---- Boston housing: packages and data ----
library(mlbench)
library(tidyverse)
library(DataExplorer)

# Load the data frame from mlbench into the workspace.
data("BostonHousing", package = "mlbench")

# Overview chart used to check for missing values and column types.
BostonHousing %>% plot_intro()

# Dimensions, followed by a column-by-column preview.
BostonHousing %>% dim()
## [1] 506  14
BostonHousing %>% glimpse()
## Rows: 506
## Columns: 14
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <dbl> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ b       <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
## Per-column summary statistics (console output reproduced below)
BostonHousing %>% summary()
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio            b              lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
# Correlation matrix (cor() default method) of the numeric columns.
# `chas` is a factor, so it is left out by keeping numeric columns only.
# select() is namespaced so that a package attached later which also exports
# select() cannot mask the dplyr version.
correlation <- cor(dplyr::select(BostonHousing, where(is.numeric)))
library(corrplot)
# corrplot draws with zero margins by default, which clips the title.
# `mar` (bottom, left, top, right) reserves room for it at the top.
corrplot.mixed(
  correlation,
  order = "AOE",
  title = "Correlation plot [Boston House Price]",
  mar = c(0, 0, 2, 0)
)

# corrplot(correlation, method = 'square', order = 'FPC', type = 'lower', diag = FALSE)
# correlation[ upper.tri( correlation ) ] <- NA
# correlation


## Data visualization
library(GGally)

# Scatterplot matrices, drawn in two halves: columns 1-7, then columns 8-14.
ggpairs(
  data = BostonHousing,
  columns = 1:7,
  title = "Pair plot [Boston House Price]"
)

ggpairs(
  data = BostonHousing,
  columns = 8:14,
  title = "Pair plot [Boston House Price]"
)

## Parallel coordinate plot of every column except chas (column 4),
## which serves as the grouping variable instead.
ggparcoord(
  data = BostonHousing,
  columns = setdiff(1:14, 4L),
  groupColumn = "chas",
  title = "Parallel coordinate plot [Boston House Price]"
)

# Scatter of median home value against lower-status percentage,
# with the fitted simple linear regression line drawn on top in red.
plot(
  medv ~ lstat,
  data = BostonHousing,
  main = "Home value VS \n lower status population with regression line",
  xlab = "Lower status population( in %)",
  ylab = "Median home value (in thousand)"
)
abline(reg = lm(medv ~ lstat, data = BostonHousing), col = "red", lwd = 2)

###
##   Abalone Dataset
###
## https://search.r-project.org/CRAN/refmans/AppliedPredictiveModeling/html/abalone.html
## https://archive.ics.uci.edu/dataset/1/abalone

## Predict the age of abalone from physical measurements

## Dataset Information

## Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- a boring and time-consuming task.  Other measurements, which are easier to obtain, are used to predict the age.  Further information, such as weather patterns and location (hence food availability), may be required to solve the problem.

## The Abalone data consist of data from 4177 abalones. 
## The data consist of measurements of the type (male, female and infant), 
## the longest shell measurement, the diameter, height and 
## several weights (whole, shucked, viscera and shell). 
## The outcome is the number of rings. 
## The age of the abalone is the number of rings plus 1.5.

# Length - longest shell measurement (mm)
# Diameter - measured perpendicular to length (mm)
# Height - with meat in shell (mm)
#  
#  WholeWeight - whole abalone weight (g)
# ShuckedWeight - weight of meat only (g)
# VisceraWeight - gut weight, after bleeding (g)
# ShellWeight - weight after being dried (g)
# The response variable (the dependent variable in this analysis) is:
 
# Rings - number of rings in the shell (+1.5 gives the age in years)

# ---- Abalone: load and inspect ----
# Deliberately no `rm(list = ls())` here: it would delete every object in the
# workspace of whoever sources this script, and nothing below needs a clean
# environment.
library(AppliedPredictiveModeling)
library(tidyverse)
data(abalone)
dim(abalone)
## [1] 4177    9
# Give the Type factor readable labels.
levels(abalone$Type)
## [1] "F" "I" "M"
# Map each existing code to its label explicitly (new = "old"). A plain
# character vector would be matched by position, which makes it easy to attach
# the wrong label to a code.
levels(abalone$Type) <- list(Female = "F", Infant = "I", Male = "M")

glimpse(abalone)
## Rows: 4,177
## Columns: 9
## $ Type          <fct> Male, Male, Female, Male, Infant, Infant, Female, Female…
## $ LongestShell  <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, …
## $ Diameter      <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, …
## $ Height        <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, …
## $ WholeWeight   <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, …
## $ ShuckedWeight <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, …
## $ VisceraWeight <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, …
## $ ShellWeight   <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, …
## $ Rings         <int> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 1…
## Check missing and data types
library(DataExplorer)
plot_intro(abalone)

summary(abalone)
##      Type       LongestShell      Diameter          Height      
##  Female:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  Infant:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  Male  :1528   Median :0.545   Median :0.4250   Median :0.1400  
##                Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##                3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##                Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   WholeWeight     ShuckedWeight    VisceraWeight     ShellWeight    
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.0005   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300  
##  Median :0.7995   Median :0.3360   Median :0.1710   Median :0.2340  
##  Mean   :0.8287   Mean   :0.3594   Mean   :0.1806   Mean   :0.2388  
##  3rd Qu.:1.1530   3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      Rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.934  
##  3rd Qu.:11.000  
##  Max.   :29.000
## Data visualization

## Histogram of rings, one panel per Type.
## Rings is an integer count, so use one bin per ring value. The default of
## 30 bins does not line up with the integer values, and ggplot2 emits a
## message asking for an explicit binwidth.
abalone %>%
  ggplot(aes(x = Rings)) +
  geom_histogram(binwidth = 1) +
  facet_grid(~Type)

## Scatterplot matrix of columns 2-7 (LongestShell through VisceraWeight).
library(GGally)
ggpairs(abalone, columns = 2:7, title = "Pair plot [Abalone]")

## Parallel coordinate plot of columns 2-9, grouped by Type.
## The x-axis labels are rotated 90 degrees so the long column names stay
## legible.
ggparcoord(
  abalone,
  columns = 2:9,
  groupColumn = "Type",
  title = "Parallel coordinate plot [Abalone]"
) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))