# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, skimr, naniar, ggplot2, tidyr, patchwork)
# Read VIW_FNT.csv into a tibble, decoding the file as UTF-8
my_data <- readr::read_csv(
  file = "VIW_FNT.csv",
  locale = readr::locale(encoding = "UTF-8")
)
## Rows: 156291 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): WHOREGION, FLUSEASON, HEMISPHERE, ITZ, COUNTRY_CODE, COUNTRY_AREA...
## dbl (35): ISO_YEAR, ISO_WEEK, MMWR_YEAR, MMWR_WEEK, SPEC_PROCESSED_NB, SPEC...
## date (2): ISO_WEEKSTARTDATE, MMWR_WEEKSTARTDATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Identify numeric and character columns by name.
# vapply() always returns a logical vector (sapply() returns list() for an
# empty input, which would break the names()[...] subsetting below).
numeric_columns <- names(my_data)[vapply(my_data, is.numeric, logical(1))]
character_columns <- names(my_data)[vapply(my_data, is.character, logical(1))]
# Replace NA values in a single pass:
#   - numeric columns   -> 0
#   - character columns -> "" (empty string)
# NOTE: after this step my_data contains no NA in these columns, so any
# missingness check run later on my_data will report zero for them.
my_data <- my_data %>%
  mutate(
    across(all_of(numeric_columns), ~ replace_na(.x, 0)),
    across(all_of(character_columns), ~ replace_na(.x, ""))
  )
# Basic Summary: per-column summary statistics of the imputed data
summary(my_data)
## WHOREGION FLUSEASON HEMISPHERE ITZ
## Length:156291 Length:156291 Length:156291 Length:156291
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## COUNTRY_CODE COUNTRY_AREA_TERRITORY ISO_WEEKSTARTDATE ISO_YEAR
## Length:156291 Length:156291 Min. :1995-01-02 Min. :1995
## Class :character Class :character 1st Qu.:2012-02-06 1st Qu.:2012
## Mode :character Mode :character Median :2017-02-20 Median :2017
## Mean :2015-11-07 Mean :2015
## 3rd Qu.:2021-02-22 3rd Qu.:2021
## Max. :2024-05-13 Max. :2024
## ISO_WEEK MMWR_WEEKSTARTDATE MMWR_YEAR MMWR_WEEK
## Min. : 1.00 Min. :1995-01-01 Min. :1995 Min. : 1.00
## 1st Qu.:12.00 1st Qu.:2012-02-05 1st Qu.:2012 1st Qu.:12.00
## Median :25.00 Median :2017-02-19 Median :2017 Median :25.00
## Mean :25.78 Mean :2015-11-06 Mean :2015 Mean :25.78
## 3rd Qu.:40.00 3rd Qu.:2021-02-21 3rd Qu.:2021 3rd Qu.:40.00
## Max. :53.00 Max. :2024-05-12 Max. :2024 Max. :53.00
## ORIGIN_SOURCE SPEC_PROCESSED_NB SPEC_RECEIVED_NB AH1N12009
## Length:156291 Min. : 0.0 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 2.0 1st Qu.: 0.0 1st Qu.: 0.00
## Mode :character Median : 23.0 Median : 0.0 Median : 0.00
## Mean : 463.2 Mean : 295.9 Mean : 11.27
## 3rd Qu.: 91.0 3rd Qu.: 13.0 3rd Qu.: 1.00
## Max. :191785.0 Max. :191785.0 Max. :10575.00
## AH1 AH3 AH5 AH7N9
## Min. : 0.0000 Min. : 0 Min. : 0.000000 Min. :0.000000
## 1st Qu.: 0.0000 1st Qu.: 0 1st Qu.: 0.000000 1st Qu.:0.000000
## Median : 0.0000 Median : 0 Median : 0.000000 Median :0.000000
## Mean : 0.5522 Mean : 11 Mean : 0.002604 Mean :0.000282
## 3rd Qu.: 0.0000 3rd Qu.: 1 3rd Qu.: 0.000000 3rd Qu.:0.000000
## Max. :906.0000 Max. :14991 Max. :14.000000 Max. :6.000000
## ANOTSUBTYPED ANOTSUBTYPABLE AOTHER_SUBTYPE
## Min. : 0.00 Min. : 0.00000 Min. : 0.00000
## 1st Qu.: 0.00 1st Qu.: 0.00000 1st Qu.: 0.00000
## Median : 0.00 Median : 0.00000 Median : 0.00000
## Mean : 20.13 Mean : 0.01832 Mean : 0.00519
## 3rd Qu.: 0.00 3rd Qu.: 0.00000 3rd Qu.: 0.00000
## Max. :48835.00 Max. :58.00000 Max. :95.00000
## AOTHER_SUBTYPE_DETAILS INF_A BVIC_2DEL
## Length:156291 Min. : 0.00 Min. : 0.00000
## Class :character 1st Qu.: 0.00 1st Qu.: 0.00000
## Mode :character Median : 0.00 Median : 0.00000
## Mean : 42.99 Mean : 0.01976
## 3rd Qu.: 6.00 3rd Qu.: 0.00000
## Max. :48835.00 Max. :175.00000
## BVIC_3DEL BVIC_NODEL BVIC_DELUNK BYAM
## Min. :0.000000 Min. : 0.00 Min. : 0.00000 Min. : 0.0000
## 1st Qu.:0.000000 1st Qu.: 0.00 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median :0.000000 Median : 0.00 Median : 0.00000 Median : 0.0000
## Mean :0.000409 Mean : 2.19 Mean : 0.06771 Mean : 0.9828
## 3rd Qu.:0.000000 3rd Qu.: 0.00 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :5.000000 Max. :6596.00 Max. :129.00000 Max. :2641.0000
## BNOTDETERMINED INF_B INF_ALL INF_NEGATIVE
## Min. : -1.000 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0
## Median : 0.000 Median : 0.00 Median : 1.00 Median : 0.0
## Mean : 9.103 Mean : 12.28 Mean : 55.26 Mean : 193.3
## 3rd Qu.: 1.000 3rd Qu.: 2.00 3rd Qu.: 10.00 3rd Qu.: 0.0
## Max. :11264.000 Max. :11264.00 Max. :49007.00 Max. :147198.0
## ILI_ACTIVITY ADENO BOCA HUMAN_CORONA
## Min. :0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.:0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median :0.000 Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean :1.165 Mean : 0.8912 Mean : 0.06439 Mean : 0.4873
## 3rd Qu.:2.000 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :6.000 Max. :376.0000 Max. :226.00000 Max. :708.0000
## METAPNEUMO PARAINFLUENZA RHINO RSV
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.0000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean : 0.8256 Mean : 1.253 Mean : 1.944 Mean : 8.256
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :723.0000 Max. :609.000 Max. :1416.000 Max. :3523.000
## OTHERRESPVIRUS OTHER_RESPVIRUS_DETAILS LAB_RESULT_COMMENT
## Min. : 0.000 Length:156291 Length:156291
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 1.036
## 3rd Qu.: 0.000
## Max. :732.000
## WCR_COMMENT ISO2 ISOYW MMWRYW
## Length:156291 Length:156291 Min. :199501 Min. :199501
## Class :character Class :character 1st Qu.:201206 1st Qu.:201206
## Mode :character Mode :character Median :201708 Median :201708
## Mean :201563 Mean :201563
## 3rd Qu.:202108 3rd Qu.:202108
## Max. :202420 Max. :202420
# Extended per-type summary (character / Date / numeric) via skimr
skimr::skim(data = my_data)
Data summary
Name |
my_data |
Number of rows |
156291 |
Number of columns |
49 |
_______________________ |
|
Column type frequency: |
|
character |
12 |
Date |
2 |
numeric |
35 |
________________________ |
|
Group variables |
None |
Variable type: character
(columns: skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace)
WHOREGION |
0 |
1 |
3 |
4 |
0 |
7 |
0 |
FLUSEASON |
0 |
1 |
2 |
2 |
0 |
3 |
0 |
HEMISPHERE |
0 |
1 |
2 |
2 |
0 |
2 |
0 |
ITZ |
0 |
1 |
10 |
15 |
0 |
18 |
0 |
COUNTRY_CODE |
0 |
1 |
3 |
3 |
0 |
186 |
0 |
COUNTRY_AREA_TERRITORY |
0 |
1 |
4 |
70 |
0 |
186 |
0 |
ORIGIN_SOURCE |
0 |
1 |
8 |
11 |
0 |
3 |
0 |
AOTHER_SUBTYPE_DETAILS |
0 |
1 |
0 |
125 |
149312 |
82 |
0 |
OTHER_RESPVIRUS_DETAILS |
0 |
1 |
0 |
320 |
132553 |
5593 |
0 |
LAB_RESULT_COMMENT |
0 |
1 |
0 |
996 |
151688 |
1430 |
0 |
WCR_COMMENT |
0 |
1 |
0 |
450 |
138621 |
11445 |
0 |
ISO2 |
0 |
1 |
0 |
2 |
93 |
186 |
0 |
Variable type: Date
(columns: skim_variable | n_missing | complete_rate | min | max | median | n_unique)
ISO_WEEKSTARTDATE |
0 |
1 |
1995-01-02 |
2024-05-13 |
2017-02-20 |
1506 |
MMWR_WEEKSTARTDATE |
0 |
1 |
1995-01-01 |
2024-05-12 |
2017-02-19 |
1506 |
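Variable type: numeric
(columns: skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist)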
ISO_YEAR |
0 |
1 |
2015.37 |
6.50 |
1995 |
2012 |
2017 |
2021 |
2024 |
▁▂▃▇▇ |
ISO_WEEK |
0 |
1 |
25.78 |
15.70 |
1 |
12 |
25 |
40 |
53 |
▇▆▆▆▇ |
MMWR_YEAR |
0 |
1 |
2015.37 |
6.50 |
1995 |
2012 |
2017 |
2021 |
2024 |
▁▂▃▇▇ |
MMWR_WEEK |
0 |
1 |
25.78 |
15.70 |
1 |
12 |
25 |
40 |
53 |
▇▆▆▆▇ |
SPEC_PROCESSED_NB |
0 |
1 |
463.20 |
3457.80 |
0 |
2 |
23 |
91 |
191785 |
▇▁▁▁▁ |
SPEC_RECEIVED_NB |
0 |
1 |
295.94 |
3277.10 |
0 |
0 |
0 |
13 |
191785 |
▇▁▁▁▁ |
AH1N12009 |
0 |
1 |
11.27 |
131.42 |
0 |
0 |
0 |
1 |
10575 |
▇▁▁▁▁ |
AH1 |
0 |
1 |
0.55 |
10.48 |
0 |
0 |
0 |
0 |
906 |
▇▁▁▁▁ |
AH3 |
0 |
1 |
11.00 |
124.37 |
0 |
0 |
0 |
1 |
14991 |
▇▁▁▁▁ |
AH5 |
0 |
1 |
0.00 |
0.11 |
0 |
0 |
0 |
0 |
14 |
▇▁▁▁▁ |
AH7N9 |
0 |
1 |
0.00 |
0.03 |
0 |
0 |
0 |
0 |
6 |
▇▁▁▁▁ |
ANOTSUBTYPED |
0 |
1 |
20.13 |
378.66 |
0 |
0 |
0 |
0 |
48835 |
▇▁▁▁▁ |
ANOTSUBTYPABLE |
0 |
1 |
0.02 |
0.61 |
0 |
0 |
0 |
0 |
58 |
▇▁▁▁▁ |
AOTHER_SUBTYPE |
0 |
1 |
0.01 |
0.38 |
0 |
0 |
0 |
0 |
95 |
▇▁▁▁▁ |
INF_A |
0 |
1 |
42.99 |
442.32 |
0 |
0 |
0 |
6 |
48835 |
▇▁▁▁▁ |
BVIC_2DEL |
0 |
1 |
0.02 |
1.12 |
0 |
0 |
0 |
0 |
175 |
▇▁▁▁▁ |
BVIC_3DEL |
0 |
1 |
0.00 |
0.03 |
0 |
0 |
0 |
0 |
5 |
▇▁▁▁▁ |
BVIC_NODEL |
0 |
1 |
2.19 |
59.47 |
0 |
0 |
0 |
0 |
6596 |
▇▁▁▁▁ |
BVIC_DELUNK |
0 |
1 |
0.07 |
1.54 |
0 |
0 |
0 |
0 |
129 |
▇▁▁▁▁ |
BYAM |
0 |
1 |
0.98 |
22.53 |
0 |
0 |
0 |
0 |
2641 |
▇▁▁▁▁ |
BNOTDETERMINED |
0 |
1 |
9.10 |
121.90 |
-1 |
0 |
0 |
1 |
11264 |
▇▁▁▁▁ |
INF_B |
0 |
1 |
12.28 |
140.00 |
0 |
0 |
0 |
2 |
11264 |
▇▁▁▁▁ |
INF_ALL |
0 |
1 |
55.26 |
519.78 |
0 |
0 |
1 |
10 |
49007 |
▇▁▁▁▁ |
INF_NEGATIVE |
0 |
1 |
193.26 |
2840.85 |
0 |
0 |
0 |
0 |
147198 |
▇▁▁▁▁ |
ILI_ACTIVITY |
0 |
1 |
1.16 |
1.56 |
0 |
0 |
0 |
2 |
6 |
▇▁▂▁▁ |
ADENO |
0 |
1 |
0.89 |
8.09 |
0 |
0 |
0 |
0 |
376 |
▇▁▁▁▁ |
BOCA |
0 |
1 |
0.06 |
1.23 |
0 |
0 |
0 |
0 |
226 |
▇▁▁▁▁ |
HUMAN_CORONA |
0 |
1 |
0.49 |
9.71 |
0 |
0 |
0 |
0 |
708 |
▇▁▁▁▁ |
METAPNEUMO |
0 |
1 |
0.83 |
11.55 |
0 |
0 |
0 |
0 |
723 |
▇▁▁▁▁ |
PARAINFLUENZA |
0 |
1 |
1.25 |
13.50 |
0 |
0 |
0 |
0 |
609 |
▇▁▁▁▁ |
RHINO |
0 |
1 |
1.94 |
26.23 |
0 |
0 |
0 |
0 |
1416 |
▇▁▁▁▁ |
RSV |
0 |
1 |
8.26 |
72.92 |
0 |
0 |
0 |
0 |
3523 |
▇▁▁▁▁ |
OTHERRESPVIRUS |
0 |
1 |
1.04 |
17.50 |
0 |
0 |
0 |
0 |
732 |
▇▁▁▁▁ |
ISOYW |
0 |
1 |
201563.16 |
649.42 |
199501 |
201206 |
201708 |
202108 |
202420 |
▁▂▃▆▇ |
MMWRYW |
0 |
1 |
201563.02 |
649.52 |
199501 |
201206 |
201708 |
202108 |
202420 |
▁▂▃▆▇ |
# Missing Values Analysis
# NOTE(review): NA values in the numeric and character columns were already
# replaced (with 0 / "") earlier in this script, so the counts below are zero
# by construction. To measure real missingness, run this on the data as read.
col_missing <- my_data %>%
  is.na() %>%
  colSums()
total_missing <- my_data %>%
  is.na() %>%
  sum()
# Report the overall count, then the per-column breakdown
cat("Total missing values:", total_missing, "\n")
## Total missing values: 0
print(col_missing)
## WHOREGION FLUSEASON HEMISPHERE
## 0 0 0
## ITZ COUNTRY_CODE COUNTRY_AREA_TERRITORY
## 0 0 0
## ISO_WEEKSTARTDATE ISO_YEAR ISO_WEEK
## 0 0 0
## MMWR_WEEKSTARTDATE MMWR_YEAR MMWR_WEEK
## 0 0 0
## ORIGIN_SOURCE SPEC_PROCESSED_NB SPEC_RECEIVED_NB
## 0 0 0
## AH1N12009 AH1 AH3
## 0 0 0
## AH5 AH7N9 ANOTSUBTYPED
## 0 0 0
## ANOTSUBTYPABLE AOTHER_SUBTYPE AOTHER_SUBTYPE_DETAILS
## 0 0 0
## INF_A BVIC_2DEL BVIC_3DEL
## 0 0 0
## BVIC_NODEL BVIC_DELUNK BYAM
## 0 0 0
## BNOTDETERMINED INF_B INF_ALL
## 0 0 0
## INF_NEGATIVE ILI_ACTIVITY ADENO
## 0 0 0
## BOCA HUMAN_CORONA METAPNEUMO
## 0 0 0
## PARAINFLUENZA RHINO RSV
## 0 0 0
## OTHERRESPVIRUS OTHER_RESPVIRUS_DETAILS LAB_RESULT_COMMENT
## 0 0 0
## WCR_COMMENT ISO2 ISOYW
## 0 0 0
## MMWRYW
## 0
# Plot 1: number of missing values per variable (naniar::gg_miss_var)
p_miss <- gg_miss_var(my_data) +
  ggtitle("Missing Values in Dataset") +
  xlab("Variables") +
  ylab("Number of Missing Values")
print(p_miss)

# Inspect the structure (column names, types, first values) of the data
utils::str(object = my_data)
## tibble [156,291 × 49] (S3: tbl_df/tbl/data.frame)
## $ WHOREGION : chr [1:156291] "EUR" "AMR" "EUR" "WPR" ...
## $ FLUSEASON : chr [1:156291] "NH" "YR" "NH" "YR" ...
## $ HEMISPHERE : chr [1:156291] "NH" "SH" "NH" "NH" ...
## $ ITZ : chr [1:156291] "FLU_WST_ASIA" "FLU_TRP_SAMR" "FLU_SW_EUR" "FLU_SE_ASIA" ...
## $ COUNTRY_CODE : chr [1:156291] "AZE" "BRA" "HRV" "PHL" ...
## $ COUNTRY_AREA_TERRITORY : chr [1:156291] "Azerbaijan" "Brazil" "Croatia" "Philippines" ...
## $ ISO_WEEKSTARTDATE : Date[1:156291], format: "2015-11-16" "2009-06-15" ...
## $ ISO_YEAR : num [1:156291] 2015 2009 2014 2004 2024 ...
## $ ISO_WEEK : num [1:156291] 47 25 7 3 7 33 21 46 2 43 ...
## $ MMWR_WEEKSTARTDATE : Date[1:156291], format: "2015-11-15" "2009-06-14" ...
## $ MMWR_YEAR : num [1:156291] 2015 2009 2014 2004 2024 ...
## $ MMWR_WEEK : num [1:156291] 46 25 7 2 7 33 20 46 2 43 ...
## $ ORIGIN_SOURCE : chr [1:156291] "NONSENTINEL" "NOTDEFINED" "SENTINEL" "NOTDEFINED" ...
## $ SPEC_PROCESSED_NB : num [1:156291] 5 227 149 58 169 ...
## $ SPEC_RECEIVED_NB : num [1:156291] 0 267 0 0 169 ...
## $ AH1N12009 : num [1:156291] 0 85 0 0 52 0 0 264 0 18 ...
## $ AH1 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AH3 : num [1:156291] 0 1 0 0 0 0 1 30 0 0 ...
## $ AH5 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AH7N9 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ ANOTSUBTYPED : num [1:156291] 0 34 0 0 0 0 1 4 0 0 ...
## $ ANOTSUBTYPABLE : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AOTHER_SUBTYPE : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AOTHER_SUBTYPE_DETAILS : chr [1:156291] "" "" "" "" ...
## $ INF_A : num [1:156291] 0 120 0 0 52 0 2 298 0 18 ...
## $ BVIC_2DEL : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BVIC_3DEL : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BVIC_NODEL : num [1:156291] 0 0 0 0 13 0 0 3 0 0 ...
## $ BVIC_DELUNK : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BYAM : num [1:156291] 0 0 0 0 0 0 4 7 0 0 ...
## $ BNOTDETERMINED : num [1:156291] 0 7 0 0 5 0 0 0 0 0 ...
## $ INF_B : num [1:156291] 0 7 0 0 18 0 4 10 0 0 ...
## $ INF_ALL : num [1:156291] 0 127 0 0 70 0 6 308 0 18 ...
## $ INF_NEGATIVE : num [1:156291] 0 0 0 0 99 ...
## $ ILI_ACTIVITY : num [1:156291] 0 6 0 2 0 0 0 3 0 3 ...
## $ ADENO : num [1:156291] 0 0 0 0 0 0 11 0 0 0 ...
## $ BOCA : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ HUMAN_CORONA : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ METAPNEUMO : num [1:156291] 0 0 0 0 0 0 1 0 0 0 ...
## $ PARAINFLUENZA : num [1:156291] 0 0 0 0 0 0 51 0 0 0 ...
## $ RHINO : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ RSV : num [1:156291] 0 0 0 0 1 0 30 0 11 0 ...
## $ OTHERRESPVIRUS : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ OTHER_RESPVIRUS_DETAILS: chr [1:156291] "" "0" "" "Rhinovirus" ...
## $ LAB_RESULT_COMMENT : chr [1:156291] "" "No unsubtyped or antiviral resistence was found." "" "" ...
## $ WCR_COMMENT : chr [1:156291] "" "The Flu seasonal activity is low and H1N1 pandemic first detection." "" "" ...
## $ ISO2 : chr [1:156291] "AZ" "BR" "HR" "PH" ...
## $ ISOYW : num [1:156291] 201547 200925 201407 200403 202407 ...
## $ MMWRYW : num [1:156291] 201546 200925 201407 200402 202407 ...
# Coerce the season/hemisphere codes to factors and (re)parse the two
# week-start columns as Date using the ISO "%Y-%m-%d" layout
my_data <- my_data %>%
  mutate(
    across(c(FLUSEASON, HEMISPHERE), as.factor),
    across(
      c(ISO_WEEKSTARTDATE, MMWR_WEEKSTARTDATE),
      ~ as.Date(.x, format = "%Y-%m-%d")
    )
  )
# Confirm the new column types
str(my_data)
## tibble [156,291 × 49] (S3: tbl_df/tbl/data.frame)
## $ WHOREGION : chr [1:156291] "EUR" "AMR" "EUR" "WPR" ...
## $ FLUSEASON : Factor w/ 3 levels "NH","SH","YR": 1 3 1 3 3 1 2 3 1 1 ...
## $ HEMISPHERE : Factor w/ 2 levels "NH","SH": 1 2 1 1 1 1 2 1 1 1 ...
## $ ITZ : chr [1:156291] "FLU_WST_ASIA" "FLU_TRP_SAMR" "FLU_SW_EUR" "FLU_SE_ASIA" ...
## $ COUNTRY_CODE : chr [1:156291] "AZE" "BRA" "HRV" "PHL" ...
## $ COUNTRY_AREA_TERRITORY : chr [1:156291] "Azerbaijan" "Brazil" "Croatia" "Philippines" ...
## $ ISO_WEEKSTARTDATE : Date[1:156291], format: "2015-11-16" "2009-06-15" ...
## $ ISO_YEAR : num [1:156291] 2015 2009 2014 2004 2024 ...
## $ ISO_WEEK : num [1:156291] 47 25 7 3 7 33 21 46 2 43 ...
## $ MMWR_WEEKSTARTDATE : Date[1:156291], format: "2015-11-15" "2009-06-14" ...
## $ MMWR_YEAR : num [1:156291] 2015 2009 2014 2004 2024 ...
## $ MMWR_WEEK : num [1:156291] 46 25 7 2 7 33 20 46 2 43 ...
## $ ORIGIN_SOURCE : chr [1:156291] "NONSENTINEL" "NOTDEFINED" "SENTINEL" "NOTDEFINED" ...
## $ SPEC_PROCESSED_NB : num [1:156291] 5 227 149 58 169 ...
## $ SPEC_RECEIVED_NB : num [1:156291] 0 267 0 0 169 ...
## $ AH1N12009 : num [1:156291] 0 85 0 0 52 0 0 264 0 18 ...
## $ AH1 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AH3 : num [1:156291] 0 1 0 0 0 0 1 30 0 0 ...
## $ AH5 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AH7N9 : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ ANOTSUBTYPED : num [1:156291] 0 34 0 0 0 0 1 4 0 0 ...
## $ ANOTSUBTYPABLE : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AOTHER_SUBTYPE : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ AOTHER_SUBTYPE_DETAILS : chr [1:156291] "" "" "" "" ...
## $ INF_A : num [1:156291] 0 120 0 0 52 0 2 298 0 18 ...
## $ BVIC_2DEL : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BVIC_3DEL : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BVIC_NODEL : num [1:156291] 0 0 0 0 13 0 0 3 0 0 ...
## $ BVIC_DELUNK : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ BYAM : num [1:156291] 0 0 0 0 0 0 4 7 0 0 ...
## $ BNOTDETERMINED : num [1:156291] 0 7 0 0 5 0 0 0 0 0 ...
## $ INF_B : num [1:156291] 0 7 0 0 18 0 4 10 0 0 ...
## $ INF_ALL : num [1:156291] 0 127 0 0 70 0 6 308 0 18 ...
## $ INF_NEGATIVE : num [1:156291] 0 0 0 0 99 ...
## $ ILI_ACTIVITY : num [1:156291] 0 6 0 2 0 0 0 3 0 3 ...
## $ ADENO : num [1:156291] 0 0 0 0 0 0 11 0 0 0 ...
## $ BOCA : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ HUMAN_CORONA : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ METAPNEUMO : num [1:156291] 0 0 0 0 0 0 1 0 0 0 ...
## $ PARAINFLUENZA : num [1:156291] 0 0 0 0 0 0 51 0 0 0 ...
## $ RHINO : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ RSV : num [1:156291] 0 0 0 0 1 0 30 0 11 0 ...
## $ OTHERRESPVIRUS : num [1:156291] 0 0 0 0 0 0 0 0 0 0 ...
## $ OTHER_RESPVIRUS_DETAILS: chr [1:156291] "" "0" "" "Rhinovirus" ...
## $ LAB_RESULT_COMMENT : chr [1:156291] "" "No unsubtyped or antiviral resistence was found." "" "" ...
## $ WCR_COMMENT : chr [1:156291] "" "The Flu seasonal activity is low and H1N1 pandemic first detection." "" "" ...
## $ ISO2 : chr [1:156291] "AZ" "BR" "HR" "PH" ...
## $ ISOYW : num [1:156291] 201547 200925 201407 200403 202407 ...
## $ MMWRYW : num [1:156291] 201546 200925 201407 200402 202407 ...
# Visualizing key numeric columns
# Reshape every numeric column into long form (one row per column/value pair)
# so that a single ggplot call can facet by variable. pivot_longer() replaces
# the superseded gather(); row order differs from gather() but the histograms
# below do not depend on row order.
numeric_long <- my_data %>%
  select(all_of(numeric_columns)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Value"
  )
# Plot 2: Histograms for Numeric Columns
# scales = "free" lets each panel use its own axes, since the columns span
# very different ranges.
p_hist <- ggplot(numeric_long, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(
    title = "Histograms of Numeric Columns",
    x = "Value",
    y = "Frequency"
  )
print(p_hist)

# Plot 3: distribution of SPEC_PROCESSED_NB within each FLUSEASON level
p_box <- my_data %>%
  ggplot(mapping = aes(x = FLUSEASON, y = SPEC_PROCESSED_NB)) +
  geom_boxplot() +
  ggtitle("Boxplot of SPEC_PROCESSED_NB by FLUSEASON") +
  xlab("FLUSEASON") +
  ylab("SPEC_PROCESSED_NB") +
  theme_minimal()
print(p_box)

# Plot 4: Time Series Plot for SPEC_PROCESSED_NB over ISO_WEEKSTARTDATE
# The data holds many rows per week start date (156291 rows vs. 1506 distinct
# dates), so drawing geom_line() on the raw rows joins unrelated observations
# that share a date and produces vertical zigzags. Sum to one value per week
# first so the line is a genuine time series.
p_time <- my_data %>%
  group_by(ISO_WEEKSTARTDATE) %>%
  summarise(
    SPEC_PROCESSED_NB = sum(SPEC_PROCESSED_NB, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = ISO_WEEKSTARTDATE, y = SPEC_PROCESSED_NB)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Time Series of SPEC_PROCESSED_NB",
    x = "ISO Week Start Date",
    y = "SPEC_PROCESSED_NB (weekly total)"
  )
print(p_time)
