In this project, we are going to use three datasets related to commercial property insurance which will help in exploration of their features in relation to the risk rate. The datasets used are: ### Home_insurance ### CarPrice_Assignment ### Carlifonia-housing
This paper aligns the above datasets in the namely order as it is starting with Home_insurance then lastly Carlifonia-housing.
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(summarytools)
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## system might not have X11 capabilities; in case of errors when using dfSummary(), set st_options(use.x11 = FALSE)
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
library(DataExplorer)
library(readr) # read_csv function
library(knitr)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(plotrix)
##
## Attaching package: 'plotrix'
##
## The following object is masked from 'package:psych':
##
## rescale
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ggpubr)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
property=read.csv('home_insurance.csv')
head(property)
# check the number of the rows and columns for the data set
coln<-ncol(property)
rown<-nrow(property)
print(paste("Number of columns",coln))
## [1] "Number of columns 66"
print(paste("Number of columns",rown))
## [1] "Number of columns 256136"
# check data set
# also perform a summary of the dataset
str(property)
## 'data.frame': 256136 obs. of 66 variables:
## $ QUOTE_DATE : chr "11/22/2007" "11/22/2007" "11/23/2007" "11/23/2007" ...
## $ COVER_START : chr "22/11/2007" "01/01/2008" "23/11/2007" "12/12/2007" ...
## $ CLAIM3YEARS : chr "N" "N" "N" "N" ...
## $ P1_EMP_STATUS : chr "R" "E" "E" "R" ...
## $ P1_PT_EMP_STATUS : chr "" "" "" "" ...
## $ BUS_USE : chr "N" "Y" "N" "N" ...
## $ CLERICAL : chr "" "N" "" "" ...
## $ AD_BUILDINGS : chr "Y" "Y" "N" "N" ...
## $ RISK_RATED_AREA_B : int 19 25 NA NA 5 NA 1 NA 0 5 ...
## $ SUM_INSURED_BUILDINGS : int 1000000 1000000 0 0 1000000 0 1000000 0 1000000 1000000 ...
## $ NCD_GRANTED_YEARS_B : int 7 6 0 0 7 0 7 0 3 7 ...
## $ AD_CONTENTS : chr "Y" "Y" "Y" "Y" ...
## $ RISK_RATED_AREA_C : int 6 9 12 14 10 8 6 6 0 1 ...
## $ SUM_INSURED_CONTENTS : int 50000 50000 50000 50000 50000 50000 50000 50000 50000 50000 ...
## $ NCD_GRANTED_YEARS_C : int 7 7 7 7 7 7 7 7 3 7 ...
## $ CONTENTS_COVER : chr "Y" "Y" "N" "N" ...
## $ BUILDINGS_COVER : chr "Y" "Y" "Y" "Y" ...
## $ SPEC_SUM_INSURED : int 7500 0 0 0 0 0 0 0 0 0 ...
## $ SPEC_ITEM_PREM : num 44.4 0 0 0 0 ...
## $ UNSPEC_HRP_PREM : num 12.4 24.6 0 0 19.8 ...
## $ P1_DOB : chr "15/06/1939" "20/05/1970" "10/06/1947" "16/12/1925" ...
## $ P1_MAR_STATUS : chr "O" "M" "S" "W" ...
## $ P1_POLICY_REFUSED : chr "N" "N" "N" "N" ...
## $ P1_SEX : chr "M" "M" "M" "F" ...
## $ APPR_ALARM : chr "N" "N" "Y" "N" ...
## $ APPR_LOCKS : chr "Y" "N" "Y" "Y" ...
## $ BEDROOMS : int 3 3 2 2 3 1 2 2 3 3 ...
## $ ROOF_CONSTRUCTION : int 11 11 11 11 11 11 11 11 11 11 ...
## $ WALL_CONSTRUCTION : int 15 15 15 15 15 15 15 15 15 15 ...
## $ FLOODING : chr "Y" "Y" "Y" "Y" ...
## $ LISTED : int 3 3 3 3 3 3 3 3 3 3 ...
## $ MAX_DAYS_UNOCC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NEIGH_WATCH : chr "N" "N" "Y" "N" ...
## $ OCC_STATUS : chr "PH" "PH" "PH" "PH" ...
## $ OWNERSHIP_TYPE : int 8 3 8 18 8 18 8 12 8 8 ...
## $ PAYING_GUESTS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PROP_TYPE : int 10 2 9 19 1 7 9 7 19 10 ...
## $ SAFE_INSTALLED : chr "Y" "N" "N" "N" ...
## $ SEC_DISC_REQ : chr "Y" "N" "Y" "Y" ...
## $ SUBSIDENCE : chr "N" "N" "N" "N" ...
## $ YEARBUILT : int 1960 1960 1946 1870 1960 1960 1980 1990 1960 1946 ...
## $ CAMPAIGN_DESC : logi NA NA NA NA NA NA ...
## $ PAYMENT_METHOD : chr "PureDD" "PureDD" "PureDD" "NonDD" ...
## $ PAYMENT_FREQUENCY : int NA NA NA NA NA NA NA NA NA NA ...
## $ LEGAL_ADDON_PRE_REN : chr "N" "Y" "Y" "N" ...
## $ LEGAL_ADDON_POST_REN : chr "N" "Y" "Y" "N" ...
## $ HOME_EM_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ HOME_EM_ADDON_POST_REN: chr "N" "N" "N" "N" ...
## $ GARDEN_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ GARDEN_ADDON_POST_REN : chr "N" "N" "N" "N" ...
## $ KEYCARE_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ KEYCARE_ADDON_POST_REN: chr "N" "N" "N" "N" ...
## $ HP1_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ HP1_ADDON_POST_REN : chr "N" "N" "N" "N" ...
## $ HP2_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ HP2_ADDON_POST_REN : chr "N" "N" "N" "N" ...
## $ HP3_ADDON_PRE_REN : chr "N" "N" "N" "N" ...
## $ HP3_ADDON_POST_REN : chr "N" "N" "N" "N" ...
## $ MTA_FLAG : chr "N" "Y" "Y" "N" ...
## $ MTA_FAP : num NA 308.8 52.6 NA NA ...
## $ MTA_APRP : num NA -9.27 52.65 NA NA ...
## $ MTA_DATE : chr "" "" "03/11/2010" "" ...
## $ LAST_ANN_PREM_GROSS : num 274.8 308.8 52.6 54.2 244.6 ...
## $ POL_STATUS : chr "Lapsed" "Live" "Live" "Live" ...
## $ i : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Police : chr "P000001" "P000002" "P000003" "P000004" ...
property$CAMPAIGN_DESC <- NULL
property$i <- NULL
total <- nrow(property)
total
## [1] 256136
First we need to remove all unwanted or incomplete data records from the dataset for consistent exploration. These include null values or incorrect formated values. From that the data will be ready for exploration and visualization.
options(repr.plot.width=8, repr.plot.height=3)
# look for missing values using the DataExplorer package
plot_missing(property)
# I want to extract a dataframe showing, thus first i need to convert my dataset into data table as below.
property.data <- data.table(property)
status_groups <- property.data[, .(count=.N, percent=round((.N/total)*100, 2)), by = POL_STATUS]
status_groups <- status_groups[order(count, decreasing = TRUE)]
status_groups
pieLabels <- paste(status_groups$POL_STATUS,' ', status_groups$percent, '%')
pie3D(status_groups$count,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(status_groups$POL_STATUS)),
main="Pie Chart of Policy Status ")
status_groups[POL_STATUS != 'Lapsed', POL_STATUS:= "Non Resiliated"]
status_groups[POL_STATUS == 'Lapsed', POL_STATUS:= "Resiliated"]
status_groups <- status_groups[, .(count=sum(count), percent = round((.N*100)/sum(count), 2)), by = POL_STATUS]
status_groups[,percent := round((count*100)/sum(count), 2)]
status_groups <- status_groups[order(count, decreasing = TRUE)]
status_groups
pieLabels <- paste(status_groups$POL_STATUS,' ', status_groups$percent, '%')
pie <- pie3D(status_groups$percent,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(status_groups$POL_STATUS)),
main="Pie Chart of Resiliation")
We were curious to discover the most covered policy among the others. we thus decided to wrangle the data to find out adding a new column that show the total coverage of the policy
dt <- property.data[, .(SUM_INSURED_BUILDINGS, SUM_INSURED_CONTENTS, SPEC_SUM_INSURED)]
name <- 'total_coverage'
property.data[, (name):= SUM_INSURED_BUILDINGS+ SUM_INSURED_CONTENTS+ SPEC_SUM_INSURED]
ordered_table <- property.data[order(total_coverage, decreasing = TRUE), .(Police, SUM_INSURED_BUILDINGS, SUM_INSURED_CONTENTS, SPEC_SUM_INSURED, total_coverage)]
ordered_table <- data.table(ordered_table)
head(ordered_table, 8)
month_order <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
day_order <- c('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
property.data$quotemonth_n <- month(as.POSIXlt(property.data$QUOTE_DATE, format="%m/%d/%Y"))
property.data$covermonth_n <- month(as.POSIXlt(property.data$COVER_START, format="%d/%m/%Y"))
#i name the columns
quotesmonthDF <- data.frame(month_n = property.data$quotemonth_n )
coversmonthDF <- data.frame(month_n = property.data$covermonth_n)
#head(quotesmonthDF, 2)
#i avoid the null values and make the group by each month to get the monthly total
quotesmonthgroup <- data.table(quotesmonthDF)
quotesmonthgroup <-quotesmonthgroup[month_n <= 12]
quotesmonthgroup <-quotesmonthgroup[(order(month_n)), .(count=.N), by=month_n]
coversmonthgroup <- data.table(coversmonthDF)
coversmonthgroup <-coversmonthgroup[month_n <= 12]
coversmonthgroup <-coversmonthgroup[(order(month_n)), .(count=.N), by=month_n]
#i add the name of the month
quotesmonthgroup$month_s <- month_order[quotesmonthgroup$month_n ]
coversmonthgroup$month_s <- month_order[coversmonthgroup$month_n ]
head(quotesmonthgroup, 12)
head(coversmonthgroup, 12)
property.data$quotemonth_s <- month_order[property.data$quotemonth_n ]
property.data$covermonth_s <- month_order[property.data$covermonth_n ]
ggbarplot(quotesmonthgroup, x= "month_s" , y= "count", xlab="Month", ylab ="quantity policies", label=TRUE, title = "Most successful months in Quotation date", label.pos = "out", fill = "month_s", color = "month_s")
### It appears that January is the most popular month when it comes to
policies quotations. This is maybe for the salary bonus that employees
get in christmas and the clients makes theirs plan come true making the
quote
ggbarplot(coversmonthgroup, x= "month_s" , y= "count", xlab="Month", ylab ="quantity policies", fill = "month_s", label=TRUE, title = "Most successful months in Coverage date", label.pos = "out")
boxplotDF <- property.data[covermonth_n <= 12, .(covermonth_n, covermonth_s, total_coverage)]
ggboxplot(boxplotDF, x = "covermonth_s", y = "total_coverage", xlab="Coverage month", ylab="total coverage amount", width = 0.8, fill="covermonth_s", order=month_order)
year_built <- property.data[property.data$YEARBUILT != 'null' & property.data$YEARBUILT > 0, .(count=.N), by = YEARBUILT]
year_built <- year_built[order(YEARBUILT)]
year_built
qplot(x = year_built$YEARBUILT, y = year_built$count, xlab="Year of building construction", ylab="Number of Policies", main = "Number of buildings by year construction" , geom="line")
#Second dataset Car
car.data <- read.csv('CarPrice_Assignment.csv')
head(car.data)
str(car.data)
## 'data.frame': 205 obs. of 26 variables:
## $ car_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curbweight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
## $ peakrpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ citympg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highwaympg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : num 13495 16500 16500 13950 17450 ...
total <- nrow(car.data)
total
## [1] 205
# I want to extract a dataframe showing, thus first i need to convert my dataset into data table as below.
car.df <- data.table(car.data)
fuel_groups <- car.df[, .(count=.N, percent=round((.N/total)*100, 2)), by = fueltype]
fuel_groups <- fuel_groups[order(count, decreasing = TRUE)]
fuel_groups
pieLabels <- paste(fuel_groups$fueltype,' ', fuel_groups$percent, '%')
pie3D(fuel_groups$count,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(fuel_groups$fueltype)),
main="Pie Chart of Type of Car Fuel ")
ggplot(car.df, aes(factor(carbody ),
fill = factor(carbody ))) +
geom_bar()
### sedan is the top car type prefered.
ggplot(car.df, aes(factor(fueltype),
fill = factor(fueltype))) +
geom_bar()
ggplot(car.df, aes(factor(symboling),
fill = factor(symboling))) +
geom_bar()
### It seems that the symboling with 0 and 1 values have high number of
rows (i.e. They are most sold.)
car.df$symboling <- as.character( car.df$symboling )
ggplot(car.df, aes(x =symboling, y = price, fill = symboling)) +
geom_boxplot()
carlifonia <- read.csv('Carlifonia-housing.csv')
head(carlifonia)
head(carlifonia)
summary(carlifonia)
## longitude latitude housing_median_age total_rooms
## Min. :-124.3 Min. :32.54 Min. : 1.00 Min. : 2
## 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00 1st Qu.: 1448
## Median :-118.5 Median :34.26 Median :29.00 Median : 2127
## Mean :-119.6 Mean :35.63 Mean :28.64 Mean : 2636
## 3rd Qu.:-118.0 3rd Qu.:37.71 3rd Qu.:37.00 3rd Qu.: 3148
## Max. :-114.3 Max. :41.95 Max. :52.00 Max. :39320
##
## total_bedrooms population households median_income
## Min. : 1.0 Min. : 3 Min. : 1.0 Min. : 0.4999
## 1st Qu.: 296.0 1st Qu.: 787 1st Qu.: 280.0 1st Qu.: 2.5634
## Median : 435.0 Median : 1166 Median : 409.0 Median : 3.5348
## Mean : 537.9 Mean : 1425 Mean : 499.5 Mean : 3.8707
## 3rd Qu.: 647.0 3rd Qu.: 1725 3rd Qu.: 605.0 3rd Qu.: 4.7432
## Max. :6445.0 Max. :35682 Max. :6082.0 Max. :15.0001
## NA's :207
## median_house_value ocean_proximity
## Min. : 14999 Length:20640
## 1st Qu.:119600 Class :character
## Median :179700 Mode :character
## Mean :206856
## 3rd Qu.:264725
## Max. :500001
##
str(carlifonia)
## 'data.frame': 20640 obs. of 10 variables:
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ latitude : num 37.9 37.9 37.9 37.9 37.9 ...
## $ housing_median_age: num 41 21 52 52 52 52 52 52 42 52 ...
## $ total_rooms : num 880 7099 1467 1274 1627 ...
## $ total_bedrooms : num 129 1106 190 235 280 ...
## $ population : num 322 2401 496 558 565 ...
## $ households : num 126 1138 177 219 259 ...
## $ median_income : num 8.33 8.3 7.26 5.64 3.85 ...
## $ median_house_value: num 452600 358500 352100 341300 342200 ...
## $ ocean_proximity : chr "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...
First we need to remove all unwanted or incomplete data records from the dataset for consistent exploration. These include null values or incorrect formated values. From that the data will be ready for exploration and visualization.
options(repr.plot.width=8, repr.plot.height=3)
# look for missing values using the DataExplorer package
plot_missing(carlifonia)
## The data is clean and ready for use. There is no missing values in
the dataset features.
#To plot scatter plot
pairs(carlifonia[1:9])
#To identify Correlation
cor(carlifonia[1:9])
## longitude latitude housing_median_age total_rooms
## longitude 1.00000000 -0.92466443 -0.10819681 0.04456798
## latitude -0.92466443 1.00000000 0.01117267 -0.03609960
## housing_median_age -0.10819681 0.01117267 1.00000000 -0.36126220
## total_rooms 0.04456798 -0.03609960 -0.36126220 1.00000000
## total_bedrooms NA NA NA NA
## population 0.09977322 -0.10878475 -0.29624424 0.85712597
## households 0.05531009 -0.07103543 -0.30291601 0.91848449
## median_income -0.01517587 -0.07980913 -0.11903399 0.19804965
## median_house_value -0.04596662 -0.14416028 0.10562341 0.13415311
## total_bedrooms population households median_income
## longitude NA 0.099773223 0.05531009 -0.015175865
## latitude NA -0.108784747 -0.07103543 -0.079809127
## housing_median_age NA -0.296244240 -0.30291601 -0.119033990
## total_rooms NA 0.857125973 0.91848449 0.198049645
## total_bedrooms 1 NA NA NA
## population NA 1.000000000 0.90722227 0.004834346
## households NA 0.907222266 1.00000000 0.013033052
## median_income NA 0.004834346 0.01303305 1.000000000
## median_house_value NA -0.024649679 0.06584265 0.688075208
## median_house_value
## longitude -0.04596662
## latitude -0.14416028
## housing_median_age 0.10562341
## total_rooms 0.13415311
## total_bedrooms NA
## population -0.02464968
## households 0.06584265
## median_income 0.68807521
## median_house_value 1.00000000
#To remove Null Values
table(is.na(carlifonia$total_bedrooms))
##
## FALSE TRUE
## 20433 207
carlifonia$total_bedrooms[is.na(carlifonia$total_bedrooms)]=median(carlifonia$total_bedrooms,na.rm = TRUE)
table(is.na(carlifonia$total_bedrooms))
##
## FALSE
## 20640
#TO handle Categorical Variables
carlifonia$ocean_proximity=as.numeric(factor(carlifonia$ocean_proximity,
levels = c('<1H OCEAN','INLAND','ISLAND','NEAR BAY', 'NEAR OCEAN'),
labels = c(1,2,3,4,5)))
head(carlifonia$ocean_proximity)
## [1] 4 4 4 4 4 4
#TO handle Categorical Variables
carlifonia$ocean_proximity=as.numeric(factor(carlifonia$ocean_proximity,
levels = c('<1H OCEAN','INLAND','ISLAND','NEAR BAY', 'NEAR OCEAN'),
labels = c(1,2,3,4,5)))
#create histogram of values for price
ggplot(data=carlifonia, aes(x=median_house_value)) +
geom_histogram(fill="steelblue", color="black") +
ggtitle("Histogram of Median House Values")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#create scatterplot of median house value, grouped by ocean proximity
#ggplot(data=carlifonia, aes(x=ocean_proximity, y=median_house_value)) +
# geom_boxplot(fill="steelblue")
#create scatterplot of total_rooms vs. median_house_value, using ocean_proximity as color variable
ggplot(data=carlifonia, aes(x=total_rooms, y=median_house_value, color=ocean_proximity)) +
geom_point()