Predicting commercial property insurance risk factor

In this project, we are going to use three datasets related to commercial property insurance which will help in exploration of their features in relation to the risk rate. The datasets used are: ### Home_insurance ### CarPrice_Assignment ### Carlifonia-housing

This paper aligns the above datasets in the namely order as it is starting with Home_insurance then lastly Carlifonia-housing.

Load Libraries Required

library(ggplot2)
library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(summarytools)
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## system might not have X11 capabilities; in case of errors when using dfSummary(), set st_options(use.x11 = FALSE)
## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(DataExplorer)
library(readr) #  read_csv function
library(knitr)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(plotrix)
## 
## Attaching package: 'plotrix'
## 
## The following object is masked from 'package:psych':
## 
##     rescale
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ggpubr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Loading the first dataset “home_insurance”

Showing the 6 first records of the Home Insurance to get more familiar with the data

property=read.csv('home_insurance.csv') 

head(property)

Understanding the Data

# check the number of the rows and columns for the data set
coln<-ncol(property)
rown<-nrow(property)
print(paste("Number of columns",coln))
## [1] "Number of columns 66"
print(paste("Number of columns",rown))
## [1] "Number of columns 256136"
# check data set
# also perform a summary of the dataset
str(property)
## 'data.frame':    256136 obs. of  66 variables:
##  $ QUOTE_DATE            : chr  "11/22/2007" "11/22/2007" "11/23/2007" "11/23/2007" ...
##  $ COVER_START           : chr  "22/11/2007" "01/01/2008" "23/11/2007" "12/12/2007" ...
##  $ CLAIM3YEARS           : chr  "N" "N" "N" "N" ...
##  $ P1_EMP_STATUS         : chr  "R" "E" "E" "R" ...
##  $ P1_PT_EMP_STATUS      : chr  "" "" "" "" ...
##  $ BUS_USE               : chr  "N" "Y" "N" "N" ...
##  $ CLERICAL              : chr  "" "N" "" "" ...
##  $ AD_BUILDINGS          : chr  "Y" "Y" "N" "N" ...
##  $ RISK_RATED_AREA_B     : int  19 25 NA NA 5 NA 1 NA 0 5 ...
##  $ SUM_INSURED_BUILDINGS : int  1000000 1000000 0 0 1000000 0 1000000 0 1000000 1000000 ...
##  $ NCD_GRANTED_YEARS_B   : int  7 6 0 0 7 0 7 0 3 7 ...
##  $ AD_CONTENTS           : chr  "Y" "Y" "Y" "Y" ...
##  $ RISK_RATED_AREA_C     : int  6 9 12 14 10 8 6 6 0 1 ...
##  $ SUM_INSURED_CONTENTS  : int  50000 50000 50000 50000 50000 50000 50000 50000 50000 50000 ...
##  $ NCD_GRANTED_YEARS_C   : int  7 7 7 7 7 7 7 7 3 7 ...
##  $ CONTENTS_COVER        : chr  "Y" "Y" "N" "N" ...
##  $ BUILDINGS_COVER       : chr  "Y" "Y" "Y" "Y" ...
##  $ SPEC_SUM_INSURED      : int  7500 0 0 0 0 0 0 0 0 0 ...
##  $ SPEC_ITEM_PREM        : num  44.4 0 0 0 0 ...
##  $ UNSPEC_HRP_PREM       : num  12.4 24.6 0 0 19.8 ...
##  $ P1_DOB                : chr  "15/06/1939" "20/05/1970" "10/06/1947" "16/12/1925" ...
##  $ P1_MAR_STATUS         : chr  "O" "M" "S" "W" ...
##  $ P1_POLICY_REFUSED     : chr  "N" "N" "N" "N" ...
##  $ P1_SEX                : chr  "M" "M" "M" "F" ...
##  $ APPR_ALARM            : chr  "N" "N" "Y" "N" ...
##  $ APPR_LOCKS            : chr  "Y" "N" "Y" "Y" ...
##  $ BEDROOMS              : int  3 3 2 2 3 1 2 2 3 3 ...
##  $ ROOF_CONSTRUCTION     : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ WALL_CONSTRUCTION     : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ FLOODING              : chr  "Y" "Y" "Y" "Y" ...
##  $ LISTED                : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ MAX_DAYS_UNOCC        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NEIGH_WATCH           : chr  "N" "N" "Y" "N" ...
##  $ OCC_STATUS            : chr  "PH" "PH" "PH" "PH" ...
##  $ OWNERSHIP_TYPE        : int  8 3 8 18 8 18 8 12 8 8 ...
##  $ PAYING_GUESTS         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PROP_TYPE             : int  10 2 9 19 1 7 9 7 19 10 ...
##  $ SAFE_INSTALLED        : chr  "Y" "N" "N" "N" ...
##  $ SEC_DISC_REQ          : chr  "Y" "N" "Y" "Y" ...
##  $ SUBSIDENCE            : chr  "N" "N" "N" "N" ...
##  $ YEARBUILT             : int  1960 1960 1946 1870 1960 1960 1980 1990 1960 1946 ...
##  $ CAMPAIGN_DESC         : logi  NA NA NA NA NA NA ...
##  $ PAYMENT_METHOD        : chr  "PureDD" "PureDD" "PureDD" "NonDD" ...
##  $ PAYMENT_FREQUENCY     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LEGAL_ADDON_PRE_REN   : chr  "N" "Y" "Y" "N" ...
##  $ LEGAL_ADDON_POST_REN  : chr  "N" "Y" "Y" "N" ...
##  $ HOME_EM_ADDON_PRE_REN : chr  "N" "N" "N" "N" ...
##  $ HOME_EM_ADDON_POST_REN: chr  "N" "N" "N" "N" ...
##  $ GARDEN_ADDON_PRE_REN  : chr  "N" "N" "N" "N" ...
##  $ GARDEN_ADDON_POST_REN : chr  "N" "N" "N" "N" ...
##  $ KEYCARE_ADDON_PRE_REN : chr  "N" "N" "N" "N" ...
##  $ KEYCARE_ADDON_POST_REN: chr  "N" "N" "N" "N" ...
##  $ HP1_ADDON_PRE_REN     : chr  "N" "N" "N" "N" ...
##  $ HP1_ADDON_POST_REN    : chr  "N" "N" "N" "N" ...
##  $ HP2_ADDON_PRE_REN     : chr  "N" "N" "N" "N" ...
##  $ HP2_ADDON_POST_REN    : chr  "N" "N" "N" "N" ...
##  $ HP3_ADDON_PRE_REN     : chr  "N" "N" "N" "N" ...
##  $ HP3_ADDON_POST_REN    : chr  "N" "N" "N" "N" ...
##  $ MTA_FLAG              : chr  "N" "Y" "Y" "N" ...
##  $ MTA_FAP               : num  NA 308.8 52.6 NA NA ...
##  $ MTA_APRP              : num  NA -9.27 52.65 NA NA ...
##  $ MTA_DATE              : chr  "" "" "03/11/2010" "" ...
##  $ LAST_ANN_PREM_GROSS   : num  274.8 308.8 52.6 54.2 244.6 ...
##  $ POL_STATUS            : chr  "Lapsed" "Live" "Live" "Live" ...
##  $ i                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Police                : chr  "P000001" "P000002" "P000003" "P000004" ...
property$CAMPAIGN_DESC <- NULL
property$i <- NULL
total <- nrow(property)
total
## [1] 256136

Data cleaning and preprocessing

First we need to remove all unwanted or incomplete data records from the dataset for consistent exploration. These include null values or incorrect formated values. From that the data will be ready for exploration and visualization.

options(repr.plot.width=8, repr.plot.height=3)
# look for missing values using the DataExplorer package
plot_missing(property)

The figure above shows the state of missing values in every column of home_housing dataset.

Exploratory Data Analysis

We attempted at learning more about this features and performing appropriate wrangling steps to arrive at a form that is more suitable for analysis.

# I want to extract a dataframe showing, thus first i need to convert my dataset into data table as below.

property.data <- data.table(property)

status_groups <- property.data[, .(count=.N, percent=round((.N/total)*100, 2)), by = POL_STATUS]
status_groups <- status_groups[order(count, decreasing = TRUE)]
status_groups
pieLabels <- paste(status_groups$POL_STATUS,' ', status_groups$percent, '%')
pie3D(status_groups$count,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(status_groups$POL_STATUS)),
    main="Pie Chart of Policy Status ")

From the above diagram, We can see that the majority of the policies have a status of live (51.6%).

status_groups[POL_STATUS != 'Lapsed', POL_STATUS:= "Non Resiliated"]
status_groups[POL_STATUS == 'Lapsed', POL_STATUS:= "Resiliated"]
status_groups <- status_groups[, .(count=sum(count), percent = round((.N*100)/sum(count), 2)), by = POL_STATUS]
status_groups[,percent := round((count*100)/sum(count), 2)]
status_groups <- status_groups[order(count, decreasing = TRUE)]
status_groups
pieLabels <- paste(status_groups$POL_STATUS,' ', status_groups$percent, '%')
pie <- pie3D(status_groups$percent,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(status_groups$POL_STATUS)),
    main="Pie Chart of Resiliation")

Here we can see that 20.5% of the clients return to the policy after cancelled.

Policy best covered

We were curious to discover the most covered policy among the others. we thus decided to wrangle the data to find out adding a new column that show the total coverage of the policy

dt <- property.data[, .(SUM_INSURED_BUILDINGS, SUM_INSURED_CONTENTS, SPEC_SUM_INSURED)]
name <- 'total_coverage'
property.data[, (name):= SUM_INSURED_BUILDINGS+ SUM_INSURED_CONTENTS+ SPEC_SUM_INSURED]
ordered_table <- property.data[order(total_coverage, decreasing = TRUE), .(Police, SUM_INSURED_BUILDINGS, SUM_INSURED_CONTENTS, SPEC_SUM_INSURED, total_coverage)]
ordered_table <- data.table(ordered_table)
head(ordered_table, 8)

From the table above, The Policy with ID P098293 is the most covered policy in almost 1,138k dollars. The P032217 come in a close second with a 1,132k dollars. Policy P028759 is third but this policy has significantly less Valuable Personal Property compared to the two first ones in the list and therefore, a much smaller total coverage.

It seems that the beggining and ending months of the year have the highest number of policies. This can be attributed to the fact that people tend to dedicate their money in the summer to vacances when the kids are out of school, the parents are on vacation and therefore, the policies is more likely to be practically none.

Now i want to know if there are months that tend to be more successful than others. For this i will create a boxplot between the total coverage and the months.

boxplotDF <- property.data[covermonth_n <= 12, .(covermonth_n, covermonth_s, total_coverage)]
ggboxplot(boxplotDF, x = "covermonth_s", y = "total_coverage", xlab="Coverage month", ylab="total coverage amount", width = 0.8, fill="covermonth_s", order=month_order)

We see that effectively the months of January, February, March and November tend to yield the highest median returns. However December does not have a high total coverage, even when this month has the highest number of policies. On the other hand June and August have a not so high coverage and finally the resting months are the least successful months on the aforementioned metrics. Again, the success of the starting and ending months can be attributed to the fact that in summer the people tend to spend their money on vacation stuffs.

Lets check on number of buildings per year

year_built <- property.data[property.data$YEARBUILT != 'null' & property.data$YEARBUILT > 0, .(count=.N), by = YEARBUILT]
year_built <- year_built[order(YEARBUILT)]
year_built
qplot(x = year_built$YEARBUILT, y = year_built$count, xlab="Year of building construction", ylab="Number of Policies", main = "Number of buildings by year construction" , geom="line")

#Second dataset Car

car.data <- read.csv('CarPrice_Assignment.csv') 

head(car.data)

Data cleaning

checking the missing values

The figire above shows the state of missing values in every column of the Car dataset. We can confirm that this dataset is clean and ready for use unlike the first dataset.

Understanding Car dataset

str(car.data)
## 'data.frame':    205 obs. of  26 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : int  3 3 1 2 2 2 1 1 1 0 ...
##  $ CarName         : chr  "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
##  $ fueltype        : chr  "gas" "gas" "gas" "gas" ...
##  $ aspiration      : chr  "std" "std" "std" "std" ...
##  $ doornumber      : chr  "two" "two" "two" "four" ...
##  $ carbody         : chr  "convertible" "convertible" "hatchback" "sedan" ...
##  $ drivewheel      : chr  "rwd" "rwd" "rwd" "fwd" ...
##  $ enginelocation  : chr  "front" "front" "front" "front" ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : chr  "dohc" "dohc" "ohcv" "ohc" ...
##  $ cylindernumber  : chr  "four" "four" "six" "four" ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : chr  "mpfi" "mpfi" "mpfi" "mpfi" ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...
total <- nrow(car.data)
total
## [1] 205
# I want to extract a dataframe showing, thus first i need to convert my dataset into data table as below.

car.df <- data.table(car.data)

fuel_groups <- car.df[, .(count=.N, percent=round((.N/total)*100, 2)), by =  fueltype]
fuel_groups <- fuel_groups[order(count, decreasing = TRUE)]
fuel_groups
pieLabels <- paste(fuel_groups$fueltype,' ', fuel_groups$percent, '%')
pie3D(fuel_groups$count,labels=pieLabels,explode=0.1, radius=0.8,height=0.1, col=rainbow(length(fuel_groups$fueltype)),
    main="Pie Chart of Type of  Car Fuel ")

From the above chart, We can see that the majority/large % of cars consumes gas as their fueltype (90.24%).

ggplot(car.df, aes(factor(carbody ),
        fill = factor(carbody ))) +
    geom_bar()

### sedan is the top car type prefered.

ggplot(car.df, aes(factor(fueltype),
        fill = factor(fueltype))) +
    geom_bar()

Number of gas fueled cars are more than diesel.

ggplot(car.df, aes(factor(symboling),
        fill = factor(symboling))) +
    geom_bar()

### It seems that the symboling with 0 and 1 values have high number of rows (i.e. They are most sold.)

car.df$symboling <- as.character( car.df$symboling )

ggplot(car.df, aes(x =symboling, y = price, fill = symboling)) + 
  geom_boxplot() 

The cars with -1 symboling seems to be high priced (as it makes sense too, insurance risk rating -1 is quite good). But it seems that symboling with 3 value has the price range similar to -2 value. There is a dip in price at symboling 1.

Loading third dataset

carlifonia <- read.csv('Carlifonia-housing.csv') 

head(carlifonia)

Exploratory Analysis

head(carlifonia)
summary(carlifonia)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##                                                                     
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5348  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.5   Mean   : 3.8707  
##  3rd Qu.: 647.0   3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  NA's   :207                                                        
##  median_house_value ocean_proximity   
##  Min.   : 14999     Length:20640      
##  1st Qu.:119600     Class :character  
##  Median :179700     Mode  :character  
##  Mean   :206856                       
##  3rd Qu.:264725                       
##  Max.   :500001                       
## 
str(carlifonia)
## 'data.frame':    20640 obs. of  10 variables:
##  $ longitude         : num  -122 -122 -122 -122 -122 ...
##  $ latitude          : num  37.9 37.9 37.9 37.9 37.9 ...
##  $ housing_median_age: num  41 21 52 52 52 52 52 52 42 52 ...
##  $ total_rooms       : num  880 7099 1467 1274 1627 ...
##  $ total_bedrooms    : num  129 1106 190 235 280 ...
##  $ population        : num  322 2401 496 558 565 ...
##  $ households        : num  126 1138 177 219 259 ...
##  $ median_income     : num  8.33 8.3 7.26 5.64 3.85 ...
##  $ median_house_value: num  452600 358500 352100 341300 342200 ...
##  $ ocean_proximity   : chr  "NEAR BAY" "NEAR BAY" "NEAR BAY" "NEAR BAY" ...

Data cleaning and preprocessing

First we need to remove all unwanted or incomplete data records from the dataset for consistent exploration. These include null values or incorrect formated values. From that the data will be ready for exploration and visualization.

options(repr.plot.width=8, repr.plot.height=3)
# look for missing values using the DataExplorer package
plot_missing(carlifonia)

## The data is clean and ready for use. There is no missing values in the dataset features.

#To plot scatter plot
pairs(carlifonia[1:9])

#To identify Correlation
cor(carlifonia[1:9])
##                      longitude    latitude housing_median_age total_rooms
## longitude           1.00000000 -0.92466443        -0.10819681  0.04456798
## latitude           -0.92466443  1.00000000         0.01117267 -0.03609960
## housing_median_age -0.10819681  0.01117267         1.00000000 -0.36126220
## total_rooms         0.04456798 -0.03609960        -0.36126220  1.00000000
## total_bedrooms              NA          NA                 NA          NA
## population          0.09977322 -0.10878475        -0.29624424  0.85712597
## households          0.05531009 -0.07103543        -0.30291601  0.91848449
## median_income      -0.01517587 -0.07980913        -0.11903399  0.19804965
## median_house_value -0.04596662 -0.14416028         0.10562341  0.13415311
##                    total_bedrooms   population  households median_income
## longitude                      NA  0.099773223  0.05531009  -0.015175865
## latitude                       NA -0.108784747 -0.07103543  -0.079809127
## housing_median_age             NA -0.296244240 -0.30291601  -0.119033990
## total_rooms                    NA  0.857125973  0.91848449   0.198049645
## total_bedrooms                  1           NA          NA            NA
## population                     NA  1.000000000  0.90722227   0.004834346
## households                     NA  0.907222266  1.00000000   0.013033052
## median_income                  NA  0.004834346  0.01303305   1.000000000
## median_house_value             NA -0.024649679  0.06584265   0.688075208
##                    median_house_value
## longitude                 -0.04596662
## latitude                  -0.14416028
## housing_median_age         0.10562341
## total_rooms                0.13415311
## total_bedrooms                     NA
## population                -0.02464968
## households                 0.06584265
## median_income              0.68807521
## median_house_value         1.00000000
#To remove Null Values
table(is.na(carlifonia$total_bedrooms))
## 
## FALSE  TRUE 
## 20433   207
carlifonia$total_bedrooms[is.na(carlifonia$total_bedrooms)]=median(carlifonia$total_bedrooms,na.rm = TRUE)
table(is.na(carlifonia$total_bedrooms))
## 
## FALSE 
## 20640
#TO handle Categorical Variables
carlifonia$ocean_proximity=as.numeric(factor(carlifonia$ocean_proximity,
                                     levels = c('<1H OCEAN','INLAND','ISLAND','NEAR BAY', 'NEAR OCEAN'),
                                    labels = c(1,2,3,4,5)))
head(carlifonia$ocean_proximity)
## [1] 4 4 4 4 4 4
#TO handle Categorical Variables
carlifonia$ocean_proximity=as.numeric(factor(carlifonia$ocean_proximity,
                                     levels = c('<1H OCEAN','INLAND','ISLAND','NEAR BAY', 'NEAR OCEAN'),
                                    labels = c(1,2,3,4,5)))


#create histogram of values for price
ggplot(data=carlifonia, aes(x=median_house_value)) +
  geom_histogram(fill="steelblue", color="black") +
  ggtitle("Histogram of Median House Values")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#create scatterplot of median house value, grouped by ocean proximity
#ggplot(data=carlifonia, aes(x=ocean_proximity, y=median_house_value)) + 
#  geom_boxplot(fill="steelblue")
#create scatterplot of total_rooms vs. median_house_value, using ocean_proximity as color variable
ggplot(data=carlifonia, aes(x=total_rooms, y=median_house_value, color=ocean_proximity)) + 
  geom_point()