### ------ Final assignment
###
## Boston House Price Dataset
###
## https://search.r-project.org/CRAN/refmans/mlbench/html/BostonHousing.html
# Description
# Housing data for 506 census tracts of Boston from the 1970 census. The dataframe BostonHousing contains the original data by Harrison and Rubinfeld (1979), the dataframe BostonHousing2 the corrected version with additional spatial information (see references below).
#
# Usage
# data("BostonHousing", package = "mlbench")
# Format
# The original data are 506 observations on 14 variables, medv being the target variable:
#
# crim : per capita crime rate by town
# zn : proportion of residential land zoned for lots over 25,000 sq.ft
# indus : proportion of non-retail business acres per town
# chas : Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# nox : nitric oxides concentration (parts per 10 million)
# rm : average number of rooms per dwelling
# age : proportion of owner-occupied units built prior to 1940
# dis : weighted distances to five Boston employment centres
# rad : index of accessibility to radial highways
# tax : full-value property-tax rate per USD 10,000
# ptratio : pupil-teacher ratio by town
# b : 1000(B−0.63)^2, where B is the proportion of blacks by town
# lstat : percentage of lower status of the population
# medv : median value of owner-occupied homes in USD 1000's
#
# The corrected data set has the following additional columns:
#
# cmedv : corrected median value of owner-occupied homes in USD 1000's
# town : name of town
# tract : census tract
# lon : longitude of census tract
# lat : latitude of census tract
# Packages for the Boston section: mlbench ships the data set, tidyverse
# provides the pipe, glimpse() and select(), DataExplorer provides plot_intro().
library(mlbench)
library(tidyverse)
library(DataExplorer)

# Load the original BostonHousing data frame into the workspace.
data("BostonHousing", package = "mlbench")

## Check missing values and data types
BostonHousing %>% plot_intro()

## Number of rows and columns
BostonHousing %>% dim()
## [1] 506 14
## Column types and leading values
BostonHousing %>% glimpse()
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad <dbl> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ b <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
## Summary statistics for every column (level counts for the factor `chas`)
BostonHousing %>% summary()
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio b lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
# Correlation matrix of all columns except the factor `chas`, which cor()
# cannot take because it requires numeric input.
correlation <- BostonHousing %>%
  select(-chas) %>%
  cor()
library(corrplot)
# Mixed display with variables ordered by "AOE".
corrplot.mixed(
  correlation,
  order = "AOE",
  title = "Correlation plot [Boston House Price]"
)

# corrplot(correlation, method = 'square', order = 'FPC', type = 'lower', diag = FALSE)
# correlation[ upper.tri( correlation ) ] <- NA
# correlation
## Data visualization
library(GGally)
# Pair plots drawn as two grids: columns 1-7 first, then columns 8-14.
BostonHousing %>%
  ggpairs(columns = 1:7, title = "Pair plot [Boston House Price]")

BostonHousing %>%
  ggpairs(columns = 8:14, title = "Pair plot [Boston House Price]")

## Parallel coordinate plot of every column except `chas` (column 4),
## with the lines grouped by `chas`
BostonHousing %>%
  ggparcoord(
    columns = c(1:3, 5:14),
    groupColumn = "chas",
    title = "Parallel coordinate plot [Boston House Price]"
  )

# Scatter plot of median home value (medv) against lower-status share (lstat).
plot(
  medv ~ lstat,
  data = BostonHousing,
  main = "Home value VS \n lower status population with regression line",
  xlab = "Lower status population( in %)",
  ylab = "Median home value (in thousand)"
)
# Overlay the fitted line of the linear model medv ~ lstat in red.
abline(
  reg = lm(medv ~ lstat, data = BostonHousing),
  col = "red",
  lwd = 2
)

###
## Abalone Dataset
###
## https://search.r-project.org/CRAN/refmans/AppliedPredictiveModeling/html/abalone.html
## https://archive.ics.uci.edu/dataset/1/abalone
## Predict the age of abalone from physical measurements
## Dataset Information
## Predicting the age of abalone from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- a boring and time-consuming task. Other measurements, which are easier to obtain, are used to predict the age. Further information, such as weather patterns and location (hence food availability), may be required to solve the problem.
## The Abalone data consist of data from 4177 abalones.
## The data consist of measurements of the type (male, female and infant),
## the longest shell measurement, the diameter, height and
## several weights (whole, shucked, viscera and shell).
## The outcome is the number of rings.
## The age of the abalone is the number of rings plus 1.5.
# LongestShell - longest shell measurement, i.e. length (mm)
# Diameter - measured perpendicular to length (mm)
# Height - with meat in shell (mm)
#
# WholeWeight - whole abalone weight (g)
# ShuckedWeight - weight of meat only (g)
# VisceraWeight - gut weight, after bleeding (g)
# ShellWeight - weight after being dried (g)
# The response variable, that is the dependent variable in this analysis is:
# Rings - number of rings in the shell (+1.5 gives the age in years)
# The workspace is deliberately NOT cleared with rm(list = ls()) here: doing so
# from inside a script deletes whatever the caller had in the global
# environment, and it does not detach packages anyway.
library(AppliedPredictiveModeling)
library(tidyverse)
data(abalone)
dim(abalone)
## [1] 4177 9
# Change factor levels
levels(abalone$Type)
## [1] "F" "I" "M"
# Relabel through an explicit new = old mapping. A plain character vector is
# matched by position against the existing level order F, I, M, so
# c("Male", "Female", "Infant") would label F as Male, I as Female and M as
# Infant.
levels(abalone$Type) <- list(Female = "F", Infant = "I", Male = "M")
glimpse(abalone)
## Rows: 4,177
## Columns: 9
## $ Type <fct> Male, Male, Female, Male, Infant, Infant, Female, Female…
## $ LongestShell <dbl> 0.455, 0.350, 0.530, 0.440, 0.330, 0.425, 0.530, 0.545, …
## $ Diameter <dbl> 0.365, 0.265, 0.420, 0.365, 0.255, 0.300, 0.415, 0.425, …
## $ Height <dbl> 0.095, 0.090, 0.135, 0.125, 0.080, 0.095, 0.150, 0.125, …
## $ WholeWeight <dbl> 0.5140, 0.2255, 0.6770, 0.5160, 0.2050, 0.3515, 0.7775, …
## $ ShuckedWeight <dbl> 0.2245, 0.0995, 0.2565, 0.2155, 0.0895, 0.1410, 0.2370, …
## $ VisceraWeight <dbl> 0.1010, 0.0485, 0.1415, 0.1140, 0.0395, 0.0775, 0.1415, …
## $ ShellWeight <dbl> 0.150, 0.070, 0.210, 0.155, 0.055, 0.120, 0.330, 0.260, …
## $ Rings <int> 15, 7, 9, 10, 7, 8, 20, 16, 9, 19, 14, 10, 11, 10, 10, 1…
## Check missing and data types
library(DataExplorer)
plot_intro(abalone)

summary(abalone)
## Type LongestShell Diameter Height
## Female:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## Infant:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## Male :1528 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## WholeWeight ShuckedWeight VisceraWeight ShellWeight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
## Data visualization
## Histogram of rings, one panel per Type
# Rings is an integer count, so use one bin per value. The default of 30
# equal-width bins does not line up with the integers, which can leave empty
# bins between values and makes ggplot2 ask for a better binwidth.
abalone %>%
  ggplot(aes(x = Rings)) +
  geom_histogram(binwidth = 1) +
  facet_grid(~Type)

##
library(GGally)
# Pair plot of columns 2-7: LongestShell, Diameter, Height, WholeWeight,
# ShuckedWeight and VisceraWeight.
abalone %>%
  ggpairs(columns = 2:7, title = "Pair plot [Abalone]")

## Parallel coordinate plot of columns 2-9, with the lines grouped by Type
abalone %>%
  ggparcoord(
    columns = 2:9,
    groupColumn = "Type",
    title = "Parallel coordinate plot [Abalone]"
  ) +
  # Turn the x-axis labels vertical.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
