# Restore the objects stored in the project's .RData file into the global
# environment. The script relies on `county_leukemia_128` being among them;
# `fracking_locations_128`, used further down, presumably comes from the same
# file -- TODO confirm.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact location exists.
load(file = "C:/Desktop/RWTH Aachen/Semester 1/Data & Information Management/Project/data_128.RData")
# Preview the first six rows.
head(county_leukemia_128, n = 6L)
##   id leuk_cases     AverageAge GreenNeighborhoodIndex  CarsPerHH    PopDens
## 1  1         28   50 and older             0.55333137 0.62988964 0.15221254
## 2  2         38   50 and older             0.09487596 0.20972911 0.16145311
## 3  3         36          15-50             0.51076218 0.02426262 0.02274151
## 4  4         35   50 and older             0.22416091 0.34061953 0.16278211
## 5  5         37 15 and younger             0.30390560 0.51989769 0.33524316
## 6  6         36          15-50             0.21691189 0.78509375 0.39659359
##   PctPopGrowth Dist2FracLoc_1 Dist2FracLoc_2 Dist2FracLoc_3 Dist2FracLoc_4
## 1  -0.80870577       5.831941       3.780389       3.877897       6.292500
## 2   0.07389212       8.909646       2.369199       2.870503       9.151515
## 3  -0.55979462       5.815866       2.429792       1.734616       5.740380
## 4   0.54392735       7.802678       2.594612       2.918953       8.116144
## 5  -0.59737725       9.109335       2.054256       2.561358       9.321347
## 6  -2.62724807       6.980251       4.158250       4.568718       7.307248
##   Dist2FracLoc_5 Dist2FracLoc_6 Dist2FracLoc_7 Dist2FracLoc_8 Dist2FracLoc_9
## 1      2.6148888      4.8785487      0.6207211       2.098640      0.2504166
## 2      0.7315699      5.5741146      2.8456412       4.370153      2.1724450
## 3      4.8405903      0.4680598      3.4471759       2.651246      4.0512372
## 4      1.3709540      4.9903627      1.7744627       3.236860      1.2334271
## 5      0.9848940      5.3232384      2.9753540       4.389665      2.4008013
## 6      1.6170904      6.6084237      2.3871396       4.160840      1.3183907
##   Dist2FracLoc_10      lon       lat
## 1        2.629392 44.93403 -29.47574
## 2        5.208164 89.50712 -18.58449
## 3        2.857167 37.06287  32.51696
## 4        4.011806 73.29633 -20.26620
## 5        5.234740 90.39685 -12.98208
## 6        4.807845 69.74439 -48.33941
##                                                                                                                                                         geometry
## 1                                   44.87360, 47.24555, 46.32829, 44.13322, 42.49227, 44.87360, -24.49018, -31.99073, -33.89176, -33.34422, -28.11933, -24.49018
## 2              93.49703, 91.80541, 90.59801, 86.13883, 89.74325, 91.16961, 93.49703, -17.13538, -22.62803, -22.48408, -18.64491, -15.75014, -15.97666, -17.13538
## 3 40.59488, 39.87425, 38.44133, 36.42051, 35.23348, 36.26135, 36.66375, 40.59488, 30.47198, 28.94620, 27.27499, 26.82128, 26.65704, 35.66801, 35.47690, 30.47198
## 4              78.90321, 78.49617, 73.96722, 69.78430, 69.98401, 73.80859, 78.90321, -19.58987, -21.00295, -23.39622, -19.31405, -18.89552, -15.54791, -19.58987
## 5                           91.518933, 91.169609, 89.743250, 85.631455, 89.920254, 91.518933, -9.250811, -15.976659, -15.750137, -9.768976, -7.570804, -9.250811
## 6                                                        78.81867, 79.98972, 61.54178, 69.45060, 78.81867, -50.34643, -59.63708, -59.63708, -40.97035, -50.34643

Check for missing values

# Count the missing values in every column. vapply() pins the result to one
# integer per column, whereas sapply() changes its return type with the input
# (e.g. it yields an empty list for a data frame without columns).
vapply(county_leukemia_128, function(column) sum(is.na(column)), integer(1))
##                     id             leuk_cases             AverageAge 
##                      0                      0                      0 
## GreenNeighborhoodIndex              CarsPerHH                PopDens 
##                      0                      0                      0 
##           PctPopGrowth         Dist2FracLoc_1         Dist2FracLoc_2 
##                      0                      0                      0 
##         Dist2FracLoc_3         Dist2FracLoc_4         Dist2FracLoc_5 
##                      0                      0                      0 
##         Dist2FracLoc_6         Dist2FracLoc_7         Dist2FracLoc_8 
##                      0                      0                      0 
##         Dist2FracLoc_9        Dist2FracLoc_10                    lon 
##                      0                      0                      0 
##                    lat               geometry 
##                      0                      0

#Create a new dataset with the data from the “county_leukemia_128” dataset for a better overview.

# Working copy restricted to the response (leuk_cases) and the candidate
# predictors; id, lon and lat are not selected.
data <- county_leukemia_128[
  ,
  c(
    "leuk_cases", "AverageAge", "GreenNeighborhoodIndex", "CarsPerHH",
    "PopDens", "PctPopGrowth",
    paste0("Dist2FracLoc_", 1:10)
  )
]

1. Descriptive statistics

# Five-number summary (plus mean) for numeric columns, level counts for factors.
summary(object = data)
##    leuk_cases             AverageAge  GreenNeighborhoodIndex   CarsPerHH       
##  Min.   :23.00   15 and younger:186   Min.   :0.0002333      Min.   :0.001174  
##  1st Qu.:32.00   15-50         :167   1st Qu.:0.2346412      1st Qu.:0.256101  
##  Median :35.00   50 and older  :147   Median :0.4859838      Median :0.502197  
##  Mean   :36.68                        Mean   :0.4862802      Mean   :0.497339  
##  3rd Qu.:41.00                        3rd Qu.:0.7251946      3rd Qu.:0.749654  
##  Max.   :52.00                        Max.   :0.9967076      Max.   :0.999106  
##     PopDens           PctPopGrowth     Dist2FracLoc_1   Dist2FracLoc_2  
##  Min.   :0.0000152   Min.   :-6.9749   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:0.0600216   1st Qu.:-1.4347   1st Qu.: 2.938   1st Qu.: 2.986  
##  Median :0.1753587   Median :-0.2529   Median : 4.128   Median : 4.690  
##  Mean   :0.2427995   Mean   :-0.4161   Mean   : 4.626   Mean   : 4.835  
##  3rd Qu.:0.3783928   3rd Qu.: 0.6439   3rd Qu.: 6.410   3rd Qu.: 6.770  
##  Max.   :0.9666323   Max.   : 4.9987   Max.   :10.000   Max.   :10.000  
##  Dist2FracLoc_3   Dist2FracLoc_4   Dist2FracLoc_5   Dist2FracLoc_6  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 2.868   1st Qu.: 2.547   1st Qu.: 3.373   1st Qu.: 3.075  
##  Median : 4.476   Median : 3.938   Median : 5.260   Median : 4.846  
##  Mean   : 4.587   Mean   : 4.566   Mean   : 5.245   Mean   : 4.760  
##  3rd Qu.: 6.374   3rd Qu.: 6.627   3rd Qu.: 7.139   3rd Qu.: 6.249  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  Dist2FracLoc_7   Dist2FracLoc_8   Dist2FracLoc_9   Dist2FracLoc_10 
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 2.477   1st Qu.: 2.759   1st Qu.: 2.426   1st Qu.: 3.043  
##  Median : 4.010   Median : 4.278   Median : 4.217   Median : 4.639  
##  Mean   : 4.433   Mean   : 4.608   Mean   : 4.443   Mean   : 4.771  
##  3rd Qu.: 6.161   3rd Qu.: 6.123   3rd Qu.: 6.129   3rd Qu.: 6.124  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000

Or use psych package for overview.

library(psych)
# Extended descriptive statistics (sd, skew, kurtosis, se, ...) per column.
psych::describe(x = data)
##                        vars   n  mean   sd median trimmed  mad   min   max
## leuk_cases                1 500 36.68 6.03  35.00   36.33 5.93 23.00 52.00
## AverageAge*               2 500  1.92 0.81   2.00    1.90 1.48  1.00  3.00
## GreenNeighborhoodIndex    3 500  0.49 0.29   0.49    0.49 0.36  0.00  1.00
## CarsPerHH                 4 500  0.50 0.28   0.50    0.50 0.37  0.00  1.00
## PopDens                   5 500  0.24 0.21   0.18    0.22 0.20  0.00  0.97
## PctPopGrowth              6 500 -0.42 1.79  -0.25   -0.34 1.52 -6.97  5.00
## Dist2FracLoc_1            7 500  4.63 2.46   4.13    4.52 2.29  0.00 10.00
## Dist2FracLoc_2            8 500  4.84 2.43   4.69    4.81 2.74  0.00 10.00
## Dist2FracLoc_3            9 500  4.59 2.36   4.48    4.54 2.61  0.00 10.00
## Dist2FracLoc_4           10 500  4.57 2.57   3.94    4.46 2.62  0.00 10.00
## Dist2FracLoc_5           11 500  5.24 2.52   5.26    5.26 2.80  0.00 10.00
## Dist2FracLoc_6           12 500  4.76 2.19   4.85    4.74 2.29  0.00 10.00
## Dist2FracLoc_7           13 500  4.43 2.42   4.01    4.31 2.57  0.00 10.00
## Dist2FracLoc_8           14 500  4.61 2.36   4.28    4.49 2.39  0.00 10.00
## Dist2FracLoc_9           15 500  4.44 2.44   4.22    4.35 2.75  0.00 10.00
## Dist2FracLoc_10          16 500  4.77 2.30   4.64    4.68 2.34  0.00 10.00
##                        range  skew kurtosis   se
## leuk_cases             29.00  0.49    -0.54 0.27
## AverageAge*             2.00  0.14    -1.48 0.04
## GreenNeighborhoodIndex  1.00  0.01    -1.21 0.01
## CarsPerHH               1.00 -0.01    -1.20 0.01
## PopDens                 0.97  0.98     0.24 0.01
## PctPopGrowth           11.97 -0.41     0.63 0.08
## Dist2FracLoc_1         10.00  0.42    -0.72 0.11
## Dist2FracLoc_2         10.00  0.09    -0.85 0.11
## Dist2FracLoc_3         10.00  0.16    -0.68 0.11
## Dist2FracLoc_4         10.00  0.37    -0.95 0.11
## Dist2FracLoc_5         10.00 -0.02    -0.91 0.11
## Dist2FracLoc_6         10.00  0.04    -0.58 0.10
## Dist2FracLoc_7         10.00  0.42    -0.83 0.11
## Dist2FracLoc_8         10.00  0.43    -0.69 0.11
## Dist2FracLoc_9         10.00  0.30    -0.85 0.11
## Dist2FracLoc_10        10.00  0.31    -0.68 0.10

##Correlation analysis

library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
# Replace each age class by a representative numeric value so that the column
# can enter the correlation matrix (10, 32.5 and 65 for the three classes).
data$AverageAge <- as.numeric(
  dplyr::recode(
    data$AverageAge,
    "15 and younger" = 10,
    "15-50" = 32.5,
    "50 and older" = 65
  )
)
# Pairwise correlations between all columns of `data` (cor() default method).
cor_matrix <- cor(x = data)
print(cor_matrix)
##                          leuk_cases   AverageAge GreenNeighborhoodIndex
## leuk_cases              1.000000000  0.001595315             0.01254171
## AverageAge              0.001595315  1.000000000             0.08096118
## GreenNeighborhoodIndex  0.012541706  0.080961179             1.00000000
## CarsPerHH              -0.029098402 -0.001296003            -0.01262389
## PopDens                 0.002928462  0.012791776            -0.03060715
## PctPopGrowth            0.263413021  0.023532482            -0.01839072
## Dist2FracLoc_1         -0.303019494  0.041464072            -0.07193474
## Dist2FracLoc_2          0.667470299 -0.069501465             0.09406828
## Dist2FracLoc_3          0.665163353 -0.077272975             0.09215445
## Dist2FracLoc_4         -0.451915924  0.047496920            -0.07584284
## Dist2FracLoc_5          0.609088090 -0.055065381             0.07312433
## Dist2FracLoc_6          0.648532736 -0.098655144             0.06787217
## Dist2FracLoc_7          0.872528734 -0.082516591             0.07325840
## Dist2FracLoc_8          0.928650786 -0.099445337             0.06745598
## Dist2FracLoc_9          0.789691764 -0.066896478             0.07188707
## Dist2FracLoc_10         0.912249525 -0.096761561             0.05830868
##                           CarsPerHH       PopDens PctPopGrowth Dist2FracLoc_1
## leuk_cases             -0.029098402  2.928462e-03  0.263413021    -0.30301949
## AverageAge             -0.001296003  1.279178e-02  0.023532482     0.04146407
## GreenNeighborhoodIndex -0.012623888 -3.060715e-02 -0.018390720    -0.07193474
## CarsPerHH               1.000000000  6.538893e-01 -0.070490098    -0.12691925
## PopDens                 0.653889292  1.000000e+00 -0.007184758    -0.05856997
## PctPopGrowth           -0.070490098 -7.184758e-03  1.000000000     0.11664700
## Dist2FracLoc_1         -0.126919253 -5.856997e-02  0.116647004     1.00000000
## Dist2FracLoc_2          0.075872336  5.449307e-02  0.023886772    -0.81212328
## Dist2FracLoc_3          0.068587914  5.626745e-02  0.045493539    -0.73601885
## Dist2FracLoc_4         -0.106674289 -5.375889e-02  0.063460900     0.97444279
## Dist2FracLoc_5          0.043008216  2.548995e-02 -0.004166902    -0.75789500
## Dist2FracLoc_6          0.012488537  4.300793e-02  0.210173960    -0.26970857
## Dist2FracLoc_7         -0.033819984  4.501406e-03  0.150458073    -0.44998740
## Dist2FracLoc_8         -0.042852013  1.090878e-02  0.204104607    -0.29970650
## Dist2FracLoc_9         -0.027035882  8.064044e-05  0.121025587    -0.49101979
## Dist2FracLoc_10        -0.059678938  6.422768e-03  0.243320308    -0.15812974
##                        Dist2FracLoc_2 Dist2FracLoc_3 Dist2FracLoc_4
## leuk_cases                 0.66747030     0.66516335    -0.45191592
## AverageAge                -0.06950146    -0.07727298     0.04749692
## GreenNeighborhoodIndex     0.09406828     0.09215445    -0.07584284
## CarsPerHH                  0.07587234     0.06858791    -0.10667429
## PopDens                    0.05449307     0.05626745    -0.05375889
## PctPopGrowth               0.02388677     0.04549354     0.06346090
## Dist2FracLoc_1            -0.81212328    -0.73601885     0.97444279
## Dist2FracLoc_2             1.00000000     0.98623389    -0.84654073
## Dist2FracLoc_3             0.98623389     1.00000000    -0.76023933
## Dist2FracLoc_4            -0.84654073    -0.76023933     1.00000000
## Dist2FracLoc_5             0.69535159     0.58670645    -0.87586048
## Dist2FracLoc_6             0.72825021     0.81983694    -0.29917426
## Dist2FracLoc_7             0.65899878     0.60352816    -0.62073901
## Dist2FracLoc_8             0.67005991     0.66758452    -0.46222452
## Dist2FracLoc_9             0.59730577     0.51465621    -0.66075023
## Dist2FracLoc_10            0.57899784     0.59363402    -0.32511914
##                        Dist2FracLoc_5 Dist2FracLoc_6 Dist2FracLoc_7
## leuk_cases                0.609088090     0.64853274    0.872528734
## AverageAge               -0.055065381    -0.09865514   -0.082516591
## GreenNeighborhoodIndex    0.073124334     0.06787217    0.073258402
## CarsPerHH                 0.043008216     0.01248854   -0.033819984
## PopDens                   0.025489952     0.04300793    0.004501406
## PctPopGrowth             -0.004166902     0.21017396    0.150458073
## Dist2FracLoc_1           -0.757894997    -0.26970857   -0.449987403
## Dist2FracLoc_2            0.695351595     0.72825021    0.658998775
## Dist2FracLoc_3            0.586706453     0.81983694    0.603528158
## Dist2FracLoc_4           -0.875860480    -0.29917426   -0.620739014
## Dist2FracLoc_5            1.000000000     0.17313755    0.844247465
## Dist2FracLoc_6            0.173137547     1.00000000    0.435114211
## Dist2FracLoc_7            0.844247465     0.43511421    1.000000000
## Dist2FracLoc_8            0.646717030     0.67227640    0.935019416
## Dist2FracLoc_9            0.906933491     0.26646655    0.978213426
## Dist2FracLoc_10           0.531716702     0.68459464    0.881637341
##                        Dist2FracLoc_8 Dist2FracLoc_9 Dist2FracLoc_10
## leuk_cases                 0.92865079   7.896918e-01     0.912249525
## AverageAge                -0.09944534  -6.689648e-02    -0.096761561
## GreenNeighborhoodIndex     0.06745598   7.188707e-02     0.058308683
## CarsPerHH                 -0.04285201  -2.703588e-02    -0.059678938
## PopDens                    0.01090878   8.064044e-05     0.006422768
## PctPopGrowth               0.20410461   1.210256e-01     0.243320308
## Dist2FracLoc_1            -0.29970650  -4.910198e-01    -0.158129736
## Dist2FracLoc_2             0.67005991   5.973058e-01     0.578997839
## Dist2FracLoc_3             0.66758452   5.146562e-01     0.593634022
## Dist2FracLoc_4            -0.46222452  -6.607502e-01    -0.325119144
## Dist2FracLoc_5             0.64671703   9.069335e-01     0.531716702
## Dist2FracLoc_6             0.67227640   2.664665e-01     0.684594636
## Dist2FracLoc_7             0.93501942   9.782134e-01     0.881637341
## Dist2FracLoc_8             1.00000000   8.446940e-01     0.987215938
## Dist2FracLoc_9             0.84469400   1.000000e+00     0.774873197
## Dist2FracLoc_10            0.98721594   7.748732e-01     1.000000000
library(corrplot)
## Warning: Paket 'corrplot' wurde unter R Version 4.4.2 erstellt
## corrplot 0.95 loaded
# Colour-coded upper triangle of the correlation matrix with small, rotated
# black labels.
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.5
)

# 2. Plot your variables using maps and pairs plots. Check the coordinate system and transform it if necessary.

library(sf)
## Warning: Paket 'sf' wurde unter R Version 4.4.2 erstellt
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(ggplot2)
## Warning: Paket 'ggplot2' wurde unter R Version 4.4.2 erstellt
## 
## Attache Paket: 'ggplot2'
## Die folgenden Objekte sind maskiert von 'package:psych':
## 
##     %+%, alpha
# Print the coordinate reference system attached to the county geometries.
sf::st_crs(x = county_leukemia_128)
## Coordinate Reference System:
##   User input: EPSG:4326 
##   wkt:
## GEOGCRS["WGS 84",
##     ENSEMBLE["World Geodetic System 1984 ensemble",
##         MEMBER["World Geodetic System 1984 (Transit)"],
##         MEMBER["World Geodetic System 1984 (G730)"],
##         MEMBER["World Geodetic System 1984 (G873)"],
##         MEMBER["World Geodetic System 1984 (G1150)"],
##         MEMBER["World Geodetic System 1984 (G1674)"],
##         MEMBER["World Geodetic System 1984 (G1762)"],
##         MEMBER["World Geodetic System 1984 (G2139)"],
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]],
##         ENSEMBLEACCURACY[2.0]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic latitude (Lat)",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic longitude (Lon)",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     USAGE[
##         SCOPE["Horizontal component of 3D system."],
##         AREA["World."],
##         BBOX[-90,-180,90,180]],
##     ID["EPSG",4326]]
# Bring the geometries to EPSG:4326. The CRS printed above is already
# EPSG:4326, so for this data set the call leaves the coordinates unchanged.
county_leukemia_128 <- sf::st_transform(x = county_leukemia_128, crs = 4326)

Plot map

# Choropleth: polygons filled by leukemia case count (plasma palette).
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = leuk_cases), colour = "black") +
  scale_fill_viridis_c(option = "plasma") +
  labs(title = "Leuk_cases", fill = "Cases") +
  theme_minimal()

# Choropleth: polygons filled by GreenNeighborhoodIndex on a light-to-dark
# green gradient.
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = GreenNeighborhoodIndex), colour = "black") +
  scale_fill_gradient(low = "lightgreen", high = "darkgreen") +
  labs(
    title = "Green Neighborhood index by district",
    fill = "Green Index"
  ) +
  theme_minimal()

Population density (PopDens):

# Choropleth: polygons filled by PopDens (inferno palette).
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = PopDens), colour = "black") +
  scale_fill_viridis_c(option = "inferno") +
  labs(title = "Population density", fill = "Density") +
  theme_minimal()

Pairs plots for analyzing correlations (the pairs plot itself follows after the age-group map and the box plot below)

Map by age group

# Choropleth of the age class, one fixed colour per class.
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = AverageAge), colour = "black") +
  scale_fill_manual(
    values = c(
      "15 and younger" = "lightblue",
      "15-50" = "orange",
      "50 and older" = "red"
    )
  ) +
  labs(title = "Distribution of the average age", fill = "Average Age") +
  theme_minimal()

Leukemia cases by age group

# Box plot of leukemia case counts, one box per age class, using the same
# class colours as the age map.
ggplot(
  data = county_leukemia_128,
  mapping = aes(x = AverageAge, y = leuk_cases, fill = AverageAge)
) +
  geom_boxplot() +
  scale_fill_manual(
    values = c(
      "15 and younger" = "lightblue",
      "15-50" = "orange",
      "50 and older" = "red"
    )
  ) +
  labs(
    title = "Leukemia cases by age group",
    x = "Average Age",
    y = "Leukemia cases"
  ) +
  theme_minimal()

library(GGally)
## Warning: Paket 'GGally' wurde unter R Version 4.4.2 erstellt
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatter-plot matrix of the response and four numeric covariates, coloured by
# age class.
ggpairs(
  data = county_leukemia_128,
  mapping = aes(colour = AverageAge, alpha = 0.7),
  columns = c(
    "leuk_cases", "GreenNeighborhoodIndex", "CarsPerHH",
    "PopDens", "PctPopGrowth"
  )
) +
  labs(title = "Pairs plot for numeric variables by age group")

Visualization: Map with leukemia cases and fracking distances

# Names of all columns whose name contains "Dist2FracLoc".
dist_columns <- grep(
  pattern = "Dist2FracLoc",
  x = names(county_leukemia_128),
  value = TRUE
)

# Preview those columns to confirm that they hold numeric distances.
head(county_leukemia_128[, dist_columns], n = 6L)
## Simple feature collection with 6 features and 10 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 35.23348 ymin: -59.63708 xmax: 93.49703 ymax: 35.66801
## Geodetic CRS:  WGS 84
##   Dist2FracLoc_1 Dist2FracLoc_2 Dist2FracLoc_3 Dist2FracLoc_4 Dist2FracLoc_5
## 1       5.831941       3.780389       3.877897       6.292500      2.6148888
## 2       8.909646       2.369199       2.870503       9.151515      0.7315699
## 3       5.815866       2.429792       1.734616       5.740380      4.8405903
## 4       7.802678       2.594612       2.918953       8.116144      1.3709540
## 5       9.109335       2.054256       2.561358       9.321347      0.9848940
## 6       6.980251       4.158250       4.568718       7.307248      1.6170904
##   Dist2FracLoc_6 Dist2FracLoc_7 Dist2FracLoc_8 Dist2FracLoc_9 Dist2FracLoc_10
## 1      4.8785487      0.6207211       2.098640      0.2504166        2.629392
## 2      5.5741146      2.8456412       4.370153      2.1724450        5.208164
## 3      0.4680598      3.4471759       2.651246      4.0512372        2.857167
## 4      4.9903627      1.7744627       3.236860      1.2334271        4.011806
## 5      5.3232384      2.9753540       4.389665      2.4008013        5.234740
## 6      6.6084237      2.3871396       4.160840      1.3183907        4.807845
##                         geometry
## 1 POLYGON ((44.8736 -24.49018...
## 2 POLYGON ((93.49703 -17.1353...
## 3 POLYGON ((40.59488 30.47198...
## 4 POLYGON ((78.90321 -19.5898...
## 5 POLYGON ((91.51893 -9.25081...
## 6 POLYGON ((78.81867 -50.3464...
# Distance from each county to its nearest fracking site: the row-wise minimum
# over the Dist2FracLoc_* columns.
# The geometry column is dropped first. Calling apply() on the sf object would
# coerce every column, including the sticky geometry, through as.matrix(), so
# the numeric distances would only survive by being converted back with
# as.numeric(). pmin() works directly on the numeric columns and is vectorised.
county_leukemia_128$MinDist2Frac <- do.call(
  pmin,
  c(
    unname(as.list(sf::st_drop_geometry(county_leukemia_128)[dist_columns])),
    na.rm = TRUE
  )
)

# Sanity check: distribution of the newly computed minimum distance.
summary(object = county_leukemia_128[["MinDist2Frac"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.7436  1.3410  1.4750  2.1443  3.8535
# Sanity check: first six minimum distances.
head(county_leukemia_128[["MinDist2Frac"]], n = 6L)
## [1] 0.2504166 0.7315699 0.4680598 1.2334271 0.9848940 1.3183907
library(ggplot2)
library(viridis)
## Warning: Paket 'viridis' wurde unter R Version 4.4.2 erstellt
## Lade nötiges Paket: viridisLite
library(sf)

# County polygons filled by leukemia case count, overlaid with one point per
# county placed at that county's lon/lat columns. The points are coloured by
# MinDist2Frac (distance to the nearest fracking site); they mark counties,
# not the fracking sites themselves.
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = leuk_cases), colour = "black") +
  geom_point(
    mapping = aes(x = lon, y = lat, colour = MinDist2Frac),
    alpha = 0.6
  ) +
  # Separate palettes so that fill (cases) and point colour (distance) can be
  # told apart.
  scale_fill_viridis_c(option = "plasma") +
  scale_colour_viridis_c(option = "magma") +
  labs(
    title = "Geographical pattern of leukemia cases and proximity to fracking sites",
    fill = "Leukemia cases",
    colour = "Distance to fracking"
  ) +
  theme_minimal()

library(ggplot2)
library(viridis)
library(sf)

# One point shape per fracking site (shapes 1-10).
shape_map <- 1:10

# County polygons filled by leukemia case count, with the fracking sites drawn
# on top as white symbols, one shape per site.
ggplot(data = county_leukemia_128) +
  geom_sf(aes(fill = leuk_cases), color = "black") +
  # The shape is mapped inside aes() so that scale_shape_manual() and the
  # "Fracking Sites" legend title below take effect. The colour is a fixed
  # value and therefore stays outside aes(); inside aes() the string "white"
  # would be treated as data, giving a default hue and a spurious legend.
  geom_sf(
    data = fracking_locations_128,
    aes(shape = factor(FracLoc)),
    color = "white",
    size = 4,
    stroke = 1.5,          # thicker symbol outline
    show.legend = "point"  # draw the legend keys as points
  ) +
  scale_fill_viridis_c(option = "plasma", direction = -1) +
  # Shapes are assigned to the levels of factor(FracLoc) in order.
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Geographical pattern of leukemia cases and fracking sites",
    fill = "Leukemia cases",
    shape = "Fracking Sites"
  ) +
  theme_minimal()

Overview map showing leukemia cases and displaying only those fracking sites that show a negative correlation with the number of leukemia cases:

library(ggplot2)
library(viridis)
library(sf)
library(dplyr)

# 1. Correlate the leukemia case count with the distance to each fracking
#    site. The Dist2FracLoc_<i> columns are taken to hold the distance to
#    site i.
leuk_cases_values <- county_leukemia_128$leuk_cases
cor_values <- vapply(
  paste0("Dist2FracLoc_", 1:10),
  function(dist_col) {
    cor(leuk_cases_values, county_leukemia_128[[dist_col]], use = "complete.obs")
  },
  numeric(1),
  USE.NAMES = FALSE
)

# 2. Keep only the sites whose distance correlates negatively with the case
#    count (correlation < 0).
negative_corr_indices <- which(cor_values < 0)
# Former (misleading) name of the same vector, kept in case later code uses it.
positive_corr_indices <- negative_corr_indices

# NOTE(review): selecting by row position assumes that row i of
# fracking_locations_128 is the site behind Dist2FracLoc_i -- TODO confirm.
filtered_fracking <- fracking_locations_128[negative_corr_indices, ]

# One shape per remaining site; seq_along() stays empty when no site qualifies
# (1:length(x) would yield c(1, 0)).
shape_map <- seq_along(negative_corr_indices)

ggplot(data = county_leukemia_128) +
  geom_sf(aes(fill = leuk_cases), color = "black") +

  # Filtered fracking sites, one shape per site. geom_sf() picks up the
  # geometry column on its own, so it is not mapped explicitly.
  geom_sf(
    data = filtered_fracking,
    aes(shape = as.factor(FracLoc)),
    size = 5,
    color = "green",
    stroke = 1.5
  ) +

  scale_fill_viridis_c(option = "plasma", direction = -1) +

  # Single shape scale; adding it a second time only replaces the first one
  # and makes ggplot2 emit a "Scale for shape is already present" message.
  scale_shape_manual(values = shape_map) +

  labs(
    # The title matches the filter above (correlation < 0).
    title = "Leukemia cases and fracking sites with negative correlation",
    fill = "Leukemia cases",
    shape = "Fracking sites"
  ) +

  theme_minimal() +

  # Legend below the map, laid out horizontally.
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.key = element_rect(fill = "white", colour = "black"),
    legend.box = "horizontal"
  ) +

  # Draw the legend keys with the same shapes as the plotted symbols.
  guides(shape = guide_legend(override.aes = list(shape = shape_map)))
## Scale for shape is already present.
## Adding another scale for shape, which will replace the existing scale.

library(ggplot2)
library(viridis)
library(sf)

# Choropleth of PctPopGrowth (magma palette) with the legend below the map.
ggplot(county_leukemia_128) +
  geom_sf(mapping = aes(fill = PctPopGrowth), colour = "black") +
  scale_fill_viridis_c(name = "Population growth (%)", option = "magma") +
  labs(
    title = "Population growth in the districts",
    fill = "Population growth (%)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.key = element_rect(fill = "white", colour = "black")
  )

library(ggplot2)
library(viridis)
library(sf)

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- seq_len(10)

# Population growth per polygon (reversed magma palette) with the fracking
# sites overlaid as green symbols, one shape per FracLoc value.
ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = PctPopGrowth),
    colour = "black"
  ) +
  geom_sf(
    data = fracking_locations_128,
    mapping = aes(shape = factor(FracLoc)),
    colour = "green",
    size = 3,
    show.legend = TRUE
  ) +
  scale_fill_viridis_c(
    name = "Population growth (%)",
    option = "magma",
    direction = -1
  ) +
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Population growth and fracking sites",
    fill = "Population growth (%)",
    shape = "Fracking site"
  ) +
  theme_minimal() +
  # Compact horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key = element_rect(fill = "white", colour = "black")
  )

library(ggplot2)
library(viridis)
library(sf)
library(gridExtra)
## Warning: Paket 'gridExtra' wurde unter R Version 4.4.2 erstellt
## 
## Attache Paket: 'gridExtra'
## Das folgende Objekt ist maskiert 'package:dplyr':
## 
##     combine
# Left panel: leukemia case count per polygon (reversed plasma palette).
leukemia_map <- ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = leuk_cases),
    colour = "black"
  ) +
  scale_fill_viridis_c(name = "Leukemia cases", option = "plasma", direction = -1) +
  labs(title = "Leukemia cases per district", fill = "Leukemia cases") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Right panel: population growth per polygon (reversed magma palette).
popgrowth_map <- ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = PctPopGrowth),
    colour = "black"
  ) +
  scale_fill_viridis_c(
    name = "Population growth (%)",
    option = "magma",
    direction = -1
  ) +
  labs(title = "Population growth per district", fill = "Population growth (%)") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Show both maps side by side.
grid.arrange(leukemia_map, popgrowth_map, ncol = 2)

#- TODO: Compute the number of leukemia cases per population and check whether the case count increases where more people live. #- TODO: At the beginning, compare the mean with the standard deviation (ideally using describe()).

library(ggplot2)
library(viridis)
library(sf)

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- seq_len(10)

# GreenNeighborhoodIndex per polygon on a green gradient, with the fracking
# sites overlaid as blue symbols, one shape per FracLoc value.
ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = GreenNeighborhoodIndex),
    colour = "black"
  ) +
  geom_sf(
    data = fracking_locations_128,
    mapping = aes(shape = factor(FracLoc)),
    colour = "blue",
    size = 5,
    show.legend = TRUE
  ) +
  scale_fill_gradient(
    name = "Green NeighborhoodIndex",
    low = "lightgreen",
    high = "darkgreen"
  ) +
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Green Neighborhood Index and Fracking Sites",
    fill = "GreenNeighborhoodIndex",
    shape = "Fracking site"
  ) +
  theme_minimal() +
  # Horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 10),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.key = element_rect(fill = "white", colour = "black")
  )

library(ggplot2)
library(viridis)
library(sf)
library(gridExtra)

# Left panel: leukemia case count per polygon (plasma palette).
leukemia_map <- ggplot() +
  geom_sf(data = county_leukemia_128, aes(fill = leuk_cases), color = "black") +
  scale_fill_viridis_c(option = "plasma", name = "Leukemia cases") +
  labs(
    title = "Leukemia cases per district",
    fill = "Leukemia cases"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Right panel: cars per household per polygon (magma palette).
# Stored under its own name: this plot used to be assigned to `popgrowth_map`,
# which mislabelled it and overwrote the population-growth map built earlier.
cars_map <- ggplot() +
  geom_sf(data = county_leukemia_128, aes(fill = CarsPerHH), color = "black") +
  scale_fill_viridis_c(option = "magma", name = "Cars per HH") +
  labs(
    title = "Cars per Household",
    fill = "Cars per HH"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8)
  )

# Show both maps side by side.
grid.arrange(leukemia_map, cars_map, ncol = 2)

library(ggplot2)
library(viridis)
library(sf)

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- seq_len(10)

# Leukemia case count per polygon (reversed magma palette) with the fracking
# sites overlaid as green symbols, one shape per FracLoc value.
ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = leuk_cases),
    colour = "black"
  ) +
  geom_sf(
    data = fracking_locations_128,
    mapping = aes(shape = factor(FracLoc)),
    colour = "green",
    size = 3,
    show.legend = TRUE
  ) +
  scale_fill_viridis_c(name = "Leukemia Cases", option = "magma", direction = -1) +
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Leukemia Cases and fracking sites",
    fill = "Leukemia Cases",
    shape = "Fracking site"
  ) +
  theme_minimal() +
  # Compact horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key = element_rect(fill = "white", colour = "black")
  )

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- 1:10

# Age class per polygon with the fracking sites overlaid as green symbols,
# one shape per FracLoc value.
ggplot() +
  geom_sf(data = county_leukemia_128, aes(fill = AverageAge), color = "black") +

  # Fixed colour per age class. `name` is an argument of scale_fill_manual();
  # it used to sit inside c(), where it became a bogus palette entry called
  # "name" instead of the legend title.
  scale_fill_manual(
    values = c(
      "15 and younger" = "lightblue",
      "15-50" = "orange",
      "50 and older" = "red"
    ),
    name = "Average Age"
  ) +

  geom_sf(
    data = fracking_locations_128,
    aes(shape = factor(FracLoc)),  # shape based on the FracLoc ID
    color = "green", size = 5,
    stroke = 1.5,
    show.legend = TRUE
  ) +

  # Manual shape assignment for the fracking sites (shapes 1-10).
  scale_shape_manual(values = shape_map) +

  labs(
    # Title added for consistency with the other "... and fracking sites" maps.
    title = "Average age and fracking sites",
    fill = "Average Age",
    shape = "Fracking site"
  ) +

  theme_minimal() +

  # Small horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 6),
    legend.text = element_text(size = 4),
    legend.key = element_rect(fill = "white", colour = "black"),
    legend.box = "horizontal",
    legend.margin = margin(t = 6)
  )

library(ggplot2)
library(viridis)
library(sf)

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- seq_len(10)

# Population density per polygon (reversed magma palette) with the fracking
# sites overlaid as green symbols, one shape per FracLoc value.
ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = PopDens),
    colour = "black"
  ) +
  geom_sf(
    data = fracking_locations_128,
    mapping = aes(shape = factor(FracLoc)),
    colour = "green",
    size = 5,
    stroke = 1.5,
    show.legend = TRUE
  ) +
  scale_fill_viridis_c(name = "PopDens", option = "magma", direction = -1) +
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Population Density and fracking sites",
    fill = "Population Density",
    shape = "Fracking site"
  ) +
  theme_minimal() +
  # Compact horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key = element_rect(fill = "white", colour = "black")
  )

library(ggplot2)
library(viridis)
library(sf)

# Point shapes 1-10, assigned to the fracking sites in factor-level order.
shape_map <- seq_len(10)

# Cars per household per polygon (reversed magma palette) with the fracking
# sites overlaid as green symbols, one shape per FracLoc value.
ggplot() +
  geom_sf(
    data = county_leukemia_128,
    mapping = aes(fill = CarsPerHH),
    colour = "black"
  ) +
  geom_sf(
    data = fracking_locations_128,
    mapping = aes(shape = factor(FracLoc)),
    colour = "green",
    size = 5,
    stroke = 1.5,
    show.legend = TRUE
  ) +
  scale_fill_viridis_c(name = "Cars per HH", option = "magma", direction = -1) +
  scale_shape_manual(values = shape_map) +
  labs(
    title = "Cars per HH and fracking sites",
    fill = "Cars per HH",
    shape = "Fracking site"
  ) +
  theme_minimal() +
  # Compact horizontal legend underneath the map.
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.margin = margin(t = 8),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key = element_rect(fill = "white", colour = "black")
  )