Boston_Housing and Swedish_Auto

# Load the Boston house-price data; the first CSV row holds the column names.
# NOTE(review): the absolute path is machine-specific, so this only runs where
# that exact file location exists.
Boston <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/Boston-house-price-data.csv",
  header = TRUE
)
library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
library(mlbench)
library(DataExplorer)
library(corrplot)

## corrplot 0.95 loaded

dim(Boston) #The data set has 506 rows and 14 columns

## [1] 506  14

names(Boston) #The names of the columns are: CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV

##  [1] "CRIM"    "ZN"      "INDUS"   "CHAS"    "NOX"     "RM"      "AGE"    
##  [8] "DIS"     "RAD"     "TAX"     "PTRATIO" "B"       "LSTAT"   "MEDV"

#Variable descriptions are as follows:
#crim: Per capita crime rate by town.
#zn: Proportion of residential land zoned for large lots (over 25,000 sq. ft.).
#indus: Proportion of non-retail business acres per town.
#chas: Charles River dummy variable (1 if tract bounds river; 0 otherwise).
#nox: Nitric oxides concentration (parts per 10 million).
#rm: Average number of rooms per dwelling.
#age: Proportion of owner-occupied units built prior to 1940.
#dis: Weighted distances to five Boston employment centers.
#rad: Index of accessibility to radial highways.
#tax: Full-value property-tax rate per $10,000.
#ptratio: pupil-teacher ratio by town
#b: 1000(B−0.63)^2, where B is the proportion of blacks by town 
#lstat: percentage of lower status of the population
#medv: median value of owner-occupied homes in USD 1000's
#This dataset provides valuable insights into the housing market dynamics and socio-economic factors influencing property values in Boston.
#It contains housing data for 506 census tracts of Boston from the 1970 census.
#In the mlbench package, the dataframe BostonHousing contains the original data by Harrison and Rubinfeld (1979), and the dataframe BostonHousing2 contains the corrected version with additional spatial information (see the mlbench documentation for references).
# Preview the first ten rows (census tracts) of the Boston data.
head(x = Boston, n = 10L)

##       CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO      B
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8 396.90
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7 394.63
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7 394.12
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5 311    15.2 386.71
##    LSTAT MEDV
## 1   4.98 24.0
## 2   9.14 21.6
## 3   4.03 34.7
## 4   2.94 33.4
## 5   5.33 36.2
## 6   5.21 28.7
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 10 17.10 18.9

# Pairwise correlations between all columns of Boston (cor()'s default
# method), kept for later reuse and printed explicitly.
Boston_Cor_Matrix <- cor(x = Boston)
print(x = Boston_Cor_Matrix)

##                CRIM          ZN       INDUS         CHAS         NOX
## CRIM     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## ZN      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## CHAS    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## NOX      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## RM      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## AGE      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## DIS     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## TAX      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## PTRATIO  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## B       -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## LSTAT    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## MEDV    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  RM         AGE         DIS          RAD         TAX    PTRATIO
## CRIM    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## ZN       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## INDUS   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## CHAS     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## NOX     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## RM       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## AGE     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## DIS      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## RAD     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## TAX     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## PTRATIO -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## B        0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## LSTAT   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## MEDV     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
##                   B      LSTAT       MEDV
## CRIM    -0.38506394  0.4556215 -0.3883046
## ZN       0.17552032 -0.4129946  0.3604453
## INDUS   -0.35697654  0.6037997 -0.4837252
## CHAS     0.04878848 -0.0539293  0.1752602
## NOX     -0.38005064  0.5908789 -0.4273208
## RM       0.12806864 -0.6138083  0.6953599
## AGE     -0.27353398  0.6023385 -0.3769546
## DIS      0.29151167 -0.4969958  0.2499287
## RAD     -0.44441282  0.4886763 -0.3816262
## TAX     -0.44180801  0.5439934 -0.4685359
## PTRATIO -0.17738330  0.3740443 -0.5077867
## B        1.00000000 -0.3660869  0.3334608
## LSTAT   -0.36608690  1.0000000 -0.7376627
## MEDV     0.33346082 -0.7376627  1.0000000

# Compact column-wise overview: one line per column with its type and the
# first few values.
dplyr::glimpse(Boston)

## Rows: 506
## Columns: 14
## $ CRIM    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ ZN      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ INDUS   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ CHAS    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NOX     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ RM      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ AGE     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ DIS     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ RAD     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ TAX     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ PTRATIO <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ B       <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ LSTAT   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ MEDV    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…

# Min / quartiles / mean / max of CRIM (per-capita crime rate by town).
summary(Boston[["CRIM"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08205  0.25651  3.61352  3.67708 88.97620

# Min / quartiles / mean / max of ZN (share of residential land zoned for
# large lots).
summary(Boston[["ZN"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   11.36   12.50  100.00

# Min / quartiles / mean / max of INDUS (share of non-retail business acres).
summary(Boston[["INDUS"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.46    5.19    9.69   11.14   18.10   27.74

# Summary of CHAS (Charles River 0/1 dummy); the mean is the share of tracts
# coded 1.
summary(Boston[["CHAS"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06917 0.00000 1.00000

# Min / quartiles / mean / max of NOX (nitric oxides concentration).
summary(Boston[["NOX"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3850  0.4490  0.5380  0.5547  0.6240  0.8710

# Min / quartiles / mean / max of RM (average number of rooms per dwelling).
summary(Boston[["RM"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.561   5.886   6.208   6.285   6.623   8.780

# Min / quartiles / mean / max of AGE (share of owner-occupied units built
# before 1940).
summary(Boston[["AGE"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.90   45.02   77.50   68.57   94.08  100.00

# Min / quartiles / mean / max of DIS (weighted distance to five Boston
# employment centers).
summary(Boston[["DIS"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.130   2.100   3.207   3.795   5.188  12.127

# Min / quartiles / mean / max of RAD (index of accessibility to radial
# highways).
summary(Boston[["RAD"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   5.000   9.549  24.000  24.000

# Correlation plot of every column except CHAS (presumably left out because
# it is a 0/1 dummy rather than a continuous measure).
# select() is namespace-qualified so that a later-attached package exporting
# its own select() cannot silently take over.
correlation <- cor(dplyr::select(Boston, -CHAS))

# corrplot draws with zero margins by default, which clips the title; reserve
# room at the top so "Correlation plot" is fully visible.
corrplot.mixed(
  correlation,
  order = "AOE",
  title = "Correlation plot",
  mar = c(0, 0, 2, 0)
)

# Scatterplot matrices of the Boston data, split into two halves so each
# panel stays readable. Each title names the column range actually shown.
ggpairs(Boston, columns = 1:7, title = "Pair Plot Boston dataset 1-7")

ggpairs(Boston, columns = 8:14, title = "Pair Plot Boston dataset 8-14")

# Parallel-coordinate plot of all columns except CHAS (column 4), with lines
# coloured by CHAS. CHAS is a 0/1 dummy stored as an integer, so it is turned
# into a factor for this call only; that gives two discrete colour groups
# instead of a continuous gradient. Boston itself is not modified.
ggparcoord(
  dplyr::mutate(Boston, CHAS = factor(CHAS)),
  columns = c(1:3, 5:14),
  groupColumn = "CHAS",
  title = "Parallel coordinate plot Boston House Price"
)

# Load the Swedish insurance data; the first CSV row holds the column names.
# NOTE(review): the absolute path is machine-specific, so this only runs where
# that exact file location exists.
Swedish <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/swedish_insurance.csv",
  header = TRUE
)
# Number of rows and columns.
dim(x = Swedish)

## [1] 63  2

# Column names of the Swedish data frame.
colnames(Swedish)

## [1] "X" "Y"

#The Swedish data set contains the following data
#X= number of claims
#Y= total payment for all the claims in thousands of Swedish Kronor for geographical zones in Sweden
# Preview the first six rows (head()'s default count, spelled out).
head(x = Swedish, n = 6L)

##     X     Y
## 1 108 392.5
## 2  19  46.2
## 3  13  15.7
## 4 124 422.2
## 5  40 119.4
## 6  57 170.9

# Correlation between number of claims (X) and total payment (Y), using
# cor()'s default method; stored and printed explicitly.
Swedish_Cor_Matrix <- cor(x = Swedish)
print(x = Swedish_Cor_Matrix)

##           X         Y
## X 1.0000000 0.9128782
## Y 0.9128782 1.0000000

# Compact column-wise overview: one line per column with its type and the
# first few values.
dplyr::glimpse(Swedish)

## Rows: 63
## Columns: 2
## $ X <int> 108, 19, 13, 124, 40, 57, 23, 14, 45, 10, 5, 48, 11, 23, 7, 2, 24, 6…
## $ Y <dbl> 392.5, 46.2, 15.7, 422.2, 119.4, 170.9, 56.9, 77.5, 214.0, 65.3, 20.…

# Min / quartiles / mean / max of X (number of claims).
summary(Swedish[["X"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.5    14.0    22.9    29.0   124.0

# Min / quartiles / mean / max of Y (total payment for all claims).
summary(Swedish[["Y"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   38.85   73.40   98.19  140.00  422.20

# Copy of the Swedish data without row 31, used by the ggplot further down.
# NOTE(review): no reason is given for dropping row 31 — confirm it is an
# intended outlier removal. The base barplot below still uses the full,
# unpruned Swedish data; verify that the two plots are meant to differ.
Swedish_pruned <- Swedish[-31, ]

# One bar per row: height is the payment (Y), labelled with its claim count (X).
barplot(
  height = Swedish$Y,
  names.arg = Swedish$X,
  col = "skyblue",
  main = "Barplot of Payments by claims",
  xlab = "Number of Claims",
  ylab = "Payments"
)

# Bar chart of total payments (Y) against number of claims (X) for the pruned
# data. geom_col() is the direct equivalent of geom_bar(stat = "identity").
# NOTE(review): rows sharing the same X value are stacked into one bar by the
# default position, so such bars show a sum — confirm this is intended.
ggplot(Swedish_pruned, aes(x = X, y = Y)) +
  geom_col() +
  labs(
    x = "Number of Claims",
    y = "Total Payments",
    title = "Total Payments by Claims"
  )

#The Swedish data show that the geographical zones with the highest number of claims also had the highest total payments

Boston_Housing and Swedish_Auto_insurance

Dr. Maxine Harlemon

2025-07-07