# Load the Boston house-price data; header = TRUE takes the column names from
# the first row of the CSV.
# NOTE(review): the path is absolute, so the script only runs where this file
# exists at exactly this location.
Boston <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/Boston-house-price-data.csv",
  header = TRUE
)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(mlbench)
library(DataExplorer)
library(corrplot)
## corrplot 0.95 loaded
dim(Boston) #The data set has 506 rows and 14 columns
## [1] 506  14
names(Boston) #The columns are: CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV
##  [1] "CRIM"    "ZN"      "INDUS"   "CHAS"    "NOX"     "RM"      "AGE"    
##  [8] "DIS"     "RAD"     "TAX"     "PTRATIO" "B"       "LSTAT"   "MEDV"
#Variable descriptions are as follows:
#crim: Per capita crime rate by town.
#zn: Proportion of residential land zoned for large lots (over 25,000 sq. ft.).
#indus: Proportion of non-retail business acres per town.
#chas: Charles River dummy variable (1 if tract bounds river; 0 otherwise).
#nox: Nitric oxides concentration (parts per 10 million).
#rm: Average number of rooms per dwelling.
#age: Proportion of owner-occupied units built prior to 1940.
#dis: Weighted distances to five Boston employment centers.
#rad: Index of accessibility to radial highways.
#tax: Full-value property-tax rate per $10,000.
#ptratio: pupil-teacher ratio by town
#b: 1000(B−0.63)^2, where B is the proportion of blacks by town 
#lstat: percentage of lower status of the population
#medv: median value of owner-occupied homes in USD 1000's
#This dataset provides insight into the housing market dynamics and socio-economic factors influencing property values in Boston. It contains housing data for 506 census tracts of Boston from the 1970 census. In the mlbench package, the data frame BostonHousing contains the original data by Harrison and Rubinfeld (1979), and the data frame BostonHousing2 contains the corrected version with additional spatial information (see the mlbench documentation for references).
# Preview the first ten rows of the Boston data.
head(x = Boston, n = 10L)
##       CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO      B
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8 396.90
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7 394.63
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7 394.12
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5 311    15.2 386.71
##    LSTAT MEDV
## 1   4.98 24.0
## 2   9.14 21.6
## 3   4.03 34.7
## 4   2.94 33.4
## 5   5.33 36.2
## 6   5.21 28.7
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 10 17.10 18.9
# Pairwise correlations between all columns of the Boston data.
# "pearson" is cor()'s default method; it is spelled out here for clarity.
Boston_Cor_Matrix <- cor(x = Boston, method = "pearson")
print(Boston_Cor_Matrix)
##                CRIM          ZN       INDUS         CHAS         NOX
## CRIM     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## ZN      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## INDUS    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## CHAS    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## NOX      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## RM      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## AGE      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## DIS     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## RAD      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## TAX      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## PTRATIO  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## B       -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## LSTAT    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## MEDV    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  RM         AGE         DIS          RAD         TAX    PTRATIO
## CRIM    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## ZN       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## INDUS   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## CHAS     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## NOX     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## RM       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## AGE     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## DIS      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## RAD     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## TAX     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## PTRATIO -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## B        0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## LSTAT   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## MEDV     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
##                   B      LSTAT       MEDV
## CRIM    -0.38506394  0.4556215 -0.3883046
## ZN       0.17552032 -0.4129946  0.3604453
## INDUS   -0.35697654  0.6037997 -0.4837252
## CHAS     0.04878848 -0.0539293  0.1752602
## NOX     -0.38005064  0.5908789 -0.4273208
## RM       0.12806864 -0.6138083  0.6953599
## AGE     -0.27353398  0.6023385 -0.3769546
## DIS      0.29151167 -0.4969958  0.2499287
## RAD     -0.44441282  0.4886763 -0.3816262
## TAX     -0.44180801  0.5439934 -0.4685359
## PTRATIO -0.17738330  0.3740443 -0.5077867
## B        1.00000000 -0.3660869  0.3334608
## LSTAT   -0.36608690  1.0000000 -0.7376627
## MEDV     0.33346082 -0.7376627  1.0000000
# Compact column-wise overview: one line per column with its type and first
# few values.
dplyr::glimpse(Boston)
## Rows: 506
## Columns: 14
## $ CRIM    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ ZN      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ INDUS   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ CHAS    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ NOX     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ RM      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ AGE     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ DIS     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ RAD     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ TAX     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ PTRATIO <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ B       <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ LSTAT   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ MEDV    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
# Min / quartiles / median / mean / max for the first nine columns (CRIM to
# RAD). The remaining columns (TAX, PTRATIO, B, LSTAT, MEDV) are not
# summarised here.
summary(Boston[["CRIM"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08205  0.25651  3.61352  3.67708 88.97620
summary(Boston[["ZN"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   11.36   12.50  100.00
summary(Boston[["INDUS"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.46    5.19    9.69   11.14   18.10   27.74
summary(Boston[["CHAS"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06917 0.00000 1.00000
summary(Boston[["NOX"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3850  0.4490  0.5380  0.5547  0.6240  0.8710
summary(Boston[["RM"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.561   5.886   6.208   6.285   6.623   8.780
summary(Boston[["AGE"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.90   45.02   77.50   68.57   94.08  100.00
summary(Boston[["DIS"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.130   2.100   3.207   3.795   5.188  12.127
summary(Boston[["RAD"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   5.000   9.549  24.000  24.000
# Correlation matrix of every column except the binary CHAS dummy.
correlation <- cor(dplyr::select(Boston, -CHAS))

# Mixed correlogram (numbers in the lower triangle, glyphs in the upper),
# with variables ordered by the angular order of the eigenvectors ("AOE").
# corrplot draws with zero margins by default, which clips the title; `mar`
# reserves space at the top so "Correlation plot" is fully visible.
corrplot.mixed(
  correlation,
  order = "AOE",
  title = "Correlation plot",
  mar = c(0, 0, 2, 0)
)

# Scatterplot matrices for the 14 columns, split into two halves so each
# panel stays readable. Each title states the column range it actually shows
# (the second plot was previously labelled "1-14" although it covers 8:14).
ggpairs(Boston, columns = 1:7, title = "Pair Plot Boston dataset 1-7")

ggpairs(Boston, columns = 8:14, title = "Pair Plot Boston dataset 8-14")

# Parallel-coordinate plot of every column except CHAS (column 4), with the
# lines coloured by CHAS.
# CHAS is a 0/1 dummy stored as an integer; converting it to a factor gives
# two discrete line colours instead of a continuous colour gradient. mutate()
# replaces the column in place, so the column positions used below still hold.
ggparcoord(
  dplyr::mutate(Boston, CHAS = factor(CHAS)),
  columns = c(1:3, 5:14),
  groupColumn = "CHAS",
  title = "Parallel coordinate plot Boston House Price"
)

# Load the Swedish insurance data (header row supplies the column names),
# then check its dimensions and column names.
# NOTE(review): the path is absolute, so the script only runs where this file
# exists at exactly this location.
Swedish <- read.csv(
  file = "/Users/maxineharlemon/AIOpt/swedish_insurance.csv",
  header = TRUE
)
dim(Swedish)
## [1] 63  2
names(Swedish)
## [1] "X" "Y"
#The Swedish data set contains the following data
#X= number of claims
#Y= total payment for all the claims in thousands of Swedish Kronor for geographical zones in Sweden
# Preview the first rows (head() shows six by default).
head(x = Swedish)
##     X     Y
## 1 108 392.5
## 2  19  46.2
## 3  13  15.7
## 4 124 422.2
## 5  40 119.4
## 6  57 170.9

# Correlation between number of claims (X) and total payment (Y).
# "pearson" is cor()'s default method; it is spelled out here for clarity.
Swedish_Cor_Matrix <- cor(x = Swedish, method = "pearson")
print(Swedish_Cor_Matrix)
##           X         Y
## X 1.0000000 0.9128782
## Y 0.9128782 1.0000000

# Column types and leading values.
dplyr::glimpse(Swedish)
## Rows: 63
## Columns: 2
## $ X <int> 108, 19, 13, 124, 40, 57, 23, 14, 45, 10, 5, 48, 11, 23, 7, 2, 24, 6…
## $ Y <dbl> 392.5, 46.2, 15.7, 422.2, 119.4, 170.9, 56.9, 77.5, 214.0, 65.3, 20.…

# Distribution summaries for each of the two columns.
summary(Swedish[["X"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     7.5    14.0    22.9    29.0   124.0
summary(Swedish[["Y"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   38.85   73.40   98.19  140.00  422.20
# Copy of the data with observation 31 removed, used by the ggplot chart below.
# NOTE(review): the reason row 31 is excluded is not recorded here -- confirm
# it is an intended outlier removal.
Swedish_pruned <- Swedish[-31, ]

# Base-graphics bar chart of total payment (Y), one bar per row, labelled with
# that row's number of claims (X).
# NOTE(review): this chart uses the full `Swedish` data, not `Swedish_pruned`
# -- confirm that is intended.
barplot(
  height = Swedish$Y,
  names.arg = Swedish$X,
  col = "skyblue",
  main = "Barplot of Payments by claims",
  xlab = "Number of Claims",
  ylab = "Payments"
)

# ggplot2 version on the pruned data. geom_col() is the shorthand for
# geom_bar(stat = "identity"); bars for rows that share the same X value are
# stacked on top of each other. The title typo ("Totla") is corrected.
ggplot(Swedish_pruned, aes(x = X, y = Y)) +
  geom_col() +
  labs(
    x = "Number of Claims",
    y = "Total Payments",
    title = "Total Payments by Claims"
  )

#The Swedish data show that the geographical zones with the highest number of claims also had the largest total payments (correlation between X and Y is about 0.91).