# Read the Boston house-price CSV into a data frame; the first row of the file
# supplies the column names.
# NOTE(review): absolute, machine-specific path — consider a project-relative
# path so the script runs elsewhere.
Boston <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/Boston-house-price-data.csv",
  header = TRUE
)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(mlbench)
library(DataExplorer)
library(corrplot)
## corrplot 0.95 loaded
# Shape of the data: 506 rows and 14 columns.
dim(Boston)
## [1] 506 14
# Column names (upper case in the file): CRIM, ZN, INDUS, CHAS, NOX, RM, AGE,
# DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV.
colnames(Boston)
## [1] "CRIM" "ZN" "INDUS" "CHAS" "NOX" "RM" "AGE"
## [8] "DIS" "RAD" "TAX" "PTRATIO" "B" "LSTAT" "MEDV"
#Variable descriptions are as follows:
#CRIM: Per capita crime rate by town.
#ZN: Proportion of residential land zoned for large lots (over 25,000 sq. ft.).
#INDUS: Proportion of non-retail business acres per town.
#CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise).
#NOX: Nitric oxides concentration (parts per 10 million).
#RM: Average number of rooms per dwelling.
#AGE: Proportion of owner-occupied units built prior to 1940.
#DIS: Weighted distances to five Boston employment centers.
#RAD: Index of accessibility to radial highways.
#TAX: Full-value property-tax rate per $10,000.
#PTRATIO: Pupil-teacher ratio by town.
#B: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town.
#LSTAT: Percentage of lower-status population.
#MEDV: Median value of owner-occupied homes in thousands of USD.
#This dataset provides valuable insights into the housing market dynamics and socio-economic factors influencing property values in Boston.
#It contains housing data for 506 census tracts of Boston from the 1970 census.
#In the mlbench package, the dataframe BostonHousing contains the original data by Harrison and Rubinfeld (1979), and the dataframe BostonHousing2 the corrected version with additional spatial information (see the mlbench documentation for references).
# Show the first ten rows to sanity-check the parsed values.
utils::head(x = Boston, n = 10L)
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## LSTAT MEDV
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 10 17.10 18.9
# Pairwise Pearson correlations between all columns, stored for reuse and
# printed in full.
Boston_Cor_Matrix <- stats::cor(x = Boston, method = "pearson")
print(x = Boston_Cor_Matrix)
## CRIM ZN INDUS CHAS NOX
## CRIM 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## ZN -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## CHAS -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## NOX 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## RM -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## AGE 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## DIS -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## TAX 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## PTRATIO 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## B -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## LSTAT 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## MEDV -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## RM AGE DIS RAD TAX PTRATIO
## CRIM -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## ZN 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## INDUS -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## CHAS 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## NOX -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## RM 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## AGE -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## DIS 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## RAD -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## TAX -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## PTRATIO -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## B 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## LSTAT -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## MEDV 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## B LSTAT MEDV
## CRIM -0.38506394 0.4556215 -0.3883046
## ZN 0.17552032 -0.4129946 0.3604453
## INDUS -0.35697654 0.6037997 -0.4837252
## CHAS 0.04878848 -0.0539293 0.1752602
## NOX -0.38005064 0.5908789 -0.4273208
## RM 0.12806864 -0.6138083 0.6953599
## AGE -0.27353398 0.6023385 -0.3769546
## DIS 0.29151167 -0.4969958 0.2499287
## RAD -0.44441282 0.4886763 -0.3816262
## TAX -0.44180801 0.5439934 -0.4685359
## PTRATIO -0.17738330 0.3740443 -0.5077867
## B 1.00000000 -0.3660869 0.3334608
## LSTAT -0.36608690 1.0000000 -0.7376627
## MEDV 0.33346082 -0.7376627 1.0000000
# Compact per-column overview: type and leading values of every column.
dplyr::glimpse(Boston)
## Rows: 506
## Columns: 14
## $ CRIM <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ ZN <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ INDUS <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ CHAS <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NOX <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ RM <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ AGE <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ DIS <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ RAD <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ TAX <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ PTRATIO <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ B <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ LSTAT <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ MEDV <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
# Min / quartiles / mean / max for the first nine columns, one column at a
# time. `[[ ]]` is used for exact-name extraction.
# NOTE(review): TAX, PTRATIO, B, LSTAT and MEDV are not summarised here.
summary(Boston[["CRIM"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08205 0.25651 3.61352 3.67708 88.97620
summary(Boston[["ZN"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 11.36 12.50 100.00
summary(Boston[["INDUS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.46 5.19 9.69 11.14 18.10 27.74
summary(Boston[["CHAS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06917 0.00000 1.00000
summary(Boston[["NOX"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3850 0.4490 0.5380 0.5547 0.6240 0.8710
summary(Boston[["RM"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.561 5.886 6.208 6.285 6.623 8.780
summary(Boston[["AGE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.90 45.02 77.50 68.57 94.08 100.00
summary(Boston[["DIS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.130 2.100 3.207 3.795 5.188 12.127
summary(Boston[["RAD"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 9.549 24.000 24.000
# Correlation plot of every column except the binary CHAS dummy.
# `mar` reserves space at the top of the device; without it corrplot clips the
# title.
correlation <- cor(select(Boston, -CHAS))
corrplot.mixed(
  correlation,
  order = "AOE",
  title = "Correlation plot",
  mar = c(0, 0, 2, 0)
)

# Pair plots split into two groups of seven columns; each title names the
# columns actually shown (the second one previously claimed "1-14").
ggpairs(
  Boston,
  columns = 1:7,
  title = "Pair Plot Boston dataset columns 1-7"
)
ggpairs(
  Boston,
  columns = 8:14,
  title = "Pair Plot Boston dataset columns 8-14"
)

# Parallel-coordinate plot of all columns except CHAS (column 4), coloured by
# CHAS. CHAS is a 0/1 dummy, so it is converted to a factor here to get a
# two-level discrete legend instead of a continuous colour gradient. The
# conversion is local to this call; `Boston` itself is not modified.
ggparcoord(
  mutate(Boston, CHAS = factor(CHAS)),
  columns = c(1:3, 5:14),
  groupColumn = "CHAS",
  title = "Parallel coordinate plot Boston House Price"
)
# Read the Swedish insurance CSV into a data frame; the first row of the file
# supplies the column names.
# NOTE(review): absolute, machine-specific path — consider a project-relative
# path so the script runs elsewhere.
Swedish <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/swedish_insurance.csv",
  header = TRUE
)
# Shape (rows, columns) and column names of the Swedish insurance data.
dim(Swedish)
## [1] 63 2
colnames(Swedish)
## [1] "X" "Y"
# The Swedish data set has two columns:
#   X = number of claims
#   Y = total payment for all the claims, in thousands of Swedish kronor,
#       for geographical zones in Sweden
# First rows (default of six) as a quick sanity check.
utils::head(x = Swedish)
## X Y
## 1 108 392.5
## 2 19 46.2
## 3 13 15.7
## 4 124 422.2
## 5 40 119.4
## 6 57 170.9
# Pearson correlation between claims (X) and payments (Y), stored and printed.
Swedish_Cor_Matrix <- stats::cor(x = Swedish, method = "pearson")
print(x = Swedish_Cor_Matrix)
## X Y
## X 1.0000000 0.9128782
## Y 0.9128782 1.0000000
# Column types and leading values.
dplyr::glimpse(Swedish)
## Rows: 63
## Columns: 2
## $ X <int> 108, 19, 13, 124, 40, 57, 23, 14, 45, 10, 5, 48, 11, 23, 7, 2, 24, 6…
## $ Y <dbl> 392.5, 46.2, 15.7, 422.2, 119.4, 170.9, 56.9, 77.5, 214.0, 65.3, 20.…
# Min / quartiles / mean / max of each column; `[[ ]]` gives exact-name
# extraction.
summary(Swedish[["X"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 7.5 14.0 22.9 29.0 124.0
summary(Swedish[["Y"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 38.85 73.40 98.19 140.00 422.20
# Copy of the data with row 31 removed; used by the ggplot below.
# NOTE(review): the reason for excluding row 31 is not recorded — presumably an
# outlier; confirm and document.
Swedish_pruned <- Swedish[-31, ]

# Base-graphics bar chart: one bar per row, bar height = payment (Y), bar label
# = number of claims (X).
# NOTE(review): this plots the full `Swedish` data, not `Swedish_pruned`, so it
# is not directly comparable with the ggplot below — confirm which is intended.
barplot(
  height = Swedish$Y,
  names.arg = Swedish$X,
  col = "skyblue",
  main = "Barplot of Payments by claims",
  xlab = "Number of Claims",
  ylab = "Payments"
)

# ggplot2 bar chart of payments against number of claims on the pruned data.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
# The title typo ("Totla") is corrected.
ggplot(Swedish_pruned, aes(x = X, y = Y)) +
  geom_col() +
  xlab("Number of Claims") +
  ylab("Total Payments") +
  ggtitle("Total Payments by Claims")
#The Swedish data show that the zones with the highest number of claims also have the highest total payments