# connection string
ch <- odbcConnect("DMU_CMK", uid = "TRACKINGDATAREADER", pwd = "p4_PISk=+MW+Z2~%u_el20zkt")

#################Pull BD Tier 4
BDdata <-sqlQuery(ch, paste("SELECT [State_CD], [County_cd], [Birth_def_CD], [Indicator_num], [Measure_Num], 
                            [Measure_Type], [RPT_YR], [ROLLING_YEAR_COUNT],[INFANT_GNDR_CD], 
                            [Min_rpt_yr], [Max_rpt_yr], [Cal_Level], [Age_band_cd], [Raceethnicity_cd], [PS_Flag],
                            [STABILITY], [PREVALENCE],  [EVENTS_AMT], [Is_Smoothed],[no_data_id],
                            [TOTAL_LIVE_BIRTHS_AMT],  [TOTAL_LIVE_BIRTHS_WITH_BD_AMT] 
                            FROM tier4.T4_BirthDefects"))
# close connection
odbcClose(ch)

Check 1: Check Measure Number

#check1 <- 
  BDdata %>% 
  group_by(Measure_Num)%>%
  count(Measure_Num)
## # A tibble: 26 x 2
## # Groups:   Measure_Num [26]
##    Measure_Num      n
##    <fct>        <int>
##  1 M00237      158146
##  2 M00238      300167
##  3 M00239      158146
##  4 M00240      300167
##  5 M00241      151996
##  6 M00242      288242
##  7 M00243      158146
##  8 M00244      300167
##  9 M00245      153226
## 10 M00246      290627
## # ... with 16 more rows
#View(check1)

Check 2: Check Measure Type

#check2 <- 
  BDdata %>% 
  group_by(Measure_Type)%>%
  count(Measure_Type)
## # A tibble: 2 x 2
## # Groups:   Measure_Type [2]
##   Measure_Type       n
##   <fct>          <int>
## 1 BASIC        3748980
## 2 SMOOTH       1702840
#View(check2)

Check 3: Check Cal Level

#check3 <- 
  BDdata %>% 
  group_by(Cal_Level)%>%
  count(Cal_Level)
## # A tibble: 12 x 2
## # Groups:   Cal_Level [12]
##    Cal_Level        n
##    <fct>        <int>
##  1 S_C_Y       499422
##  2 S_C_Y_A    1687824
##  3 S_C_Y_RE   1997688
##  4 S_C_Y_S     923586
##  5 S_Y           5528
##  6 S_Y_A        18672
##  7 S_Y_A_RE     74688
##  8 S_Y_A_RE_S  136768
##  9 S_Y_A_S      34192
## 10 S_Y_RE       22112
## 11 S_Y_RE_S     41072
## 12 S_Y_S        10268
#View(check3)

Check 4: Check Measure num by Cal Level

#check4 <- 
  BDdata %>% 
  group_by(Measure_Num, Cal_Level)%>%
  count(Cal_Level)
## # A tibble: 282 x 3
## # Groups:   Measure_Num, Cal_Level [282]
##    Measure_Num Cal_Level      n
##    <fct>       <fct>      <int>
##  1 M00237      S_C_Y      12911
##  2 M00237      S_C_Y_A    51644
##  3 M00237      S_C_Y_RE   51644
##  4 M00237      S_C_Y_S    25822
##  5 M00237      S_Y          215
##  6 M00237      S_Y_A        860
##  7 M00237      S_Y_A_RE    3440
##  8 M00237      S_Y_A_RE_S  6880
##  9 M00237      S_Y_A_S     1720
## 10 M00237      S_Y_RE       860
## # ... with 272 more rows
#View(check4)

Check 5: Check Cal Level by age band

#check5 <- 
  BDdata %>% 
  group_by(Cal_Level, Age_band_cd)%>%
  count(Age_band_cd)
## # A tibble: 27 x 3
## # Groups:   Cal_Level, Age_band_cd [27]
##    Cal_Level Age_band_cd       n
##    <fct>           <int>   <int>
##  1 S_C_Y              NA  499422
##  2 S_C_Y_A             1  421956
##  3 S_C_Y_A             2  421956
##  4 S_C_Y_A             3  421956
##  5 S_C_Y_A             4  421956
##  6 S_C_Y_RE           NA 1997688
##  7 S_C_Y_S            NA  923586
##  8 S_Y                NA    5528
##  9 S_Y_A               1    4668
## 10 S_Y_A               2    4668
## # ... with 17 more rows
#View(check5)

Check 6: Check Cal Level by race ethnicity

check6 <- 
  BDdata %>% 
  group_by(Cal_Level, Raceethnicity_cd)%>%
  count(Raceethnicity_cd)
## Warning: Factor `Raceethnicity_cd` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Raceethnicity_cd` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Raceethnicity_cd` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#View(check6)
head(check6)
## Warning: Factor `Raceethnicity_cd` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Raceethnicity_cd` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 3
## # Groups:   Cal_Level, Raceethnicity_cd [6]
##   Cal_Level Raceethnicity_cd       n
##   <fct>     <fct>              <int>
## 1 S_C_Y     <NA>              499422
## 2 S_C_Y_A   <NA>             1687824
## 3 S_C_Y_RE  B                 499422
## 4 S_C_Y_RE  H                 499422
## 5 S_C_Y_RE  O                 499422
## 6 S_C_Y_RE  W                 499422

Check 7: Check Cal Level by ps flag

#check7 <- 
  BDdata %>% 
  group_by(Cal_Level, PS_Flag)%>%
  count(PS_Flag)
## # A tibble: 16 x 3
## # Groups:   Cal_Level, PS_Flag [16]
##    Cal_Level  PS_Flag       n
##    <fct>        <int>   <int>
##  1 S_C_Y            0  332662
##  2 S_C_Y            1  166760
##  3 S_C_Y_A          0 1283500
##  4 S_C_Y_A          1  404324
##  5 S_C_Y_RE         0 1598096
##  6 S_C_Y_RE         1  399592
##  7 S_C_Y_S          0  624494
##  8 S_C_Y_S          1  299092
##  9 S_Y              0    5528
## 10 S_Y_A            0   18672
## 11 S_Y_A_RE         0   74688
## 12 S_Y_A_RE_S       0  136768
## 13 S_Y_A_S          0   34192
## 14 S_Y_RE           0   22112
## 15 S_Y_RE_S         0   41072
## 16 S_Y_S            0   10268
#View(check7)

Check 8: Check county and state

length(which(is.na(BDdata$County_cd)))
## [1] 343300
#343,300
  
#check8 <- 
  BDdata %>% 
  filter(is.na(County_cd))%>%
  group_by(State_CD, County_cd)%>%
  count(County_cd)
## # A tibble: 22 x 3
## # Groups:   State_CD, County_cd [22]
##    State_CD County_cd     n
##       <int>     <int> <int>
##  1        4        NA 15600
##  2        8        NA 20930
##  3        9        NA  1610
##  4       12        NA 24150
##  5       19        NA 17710
##  6       20        NA 11530
##  7       21        NA 14490
##  8       23        NA 17460
##  9       24        NA  3220
## 10       25        NA 20930
## # ... with 12 more rows
#View(check8)

Check 9: Check Date Range By State

#check9 <- 
  BDdata %>% 
  group_by(State_CD, Min_rpt_yr, Max_rpt_yr)%>%
  count((Max_rpt_yr-Min_rpt_yr))
## # A tibble: 242 x 5
## # Groups:   State_CD, Min_rpt_yr, Max_rpt_yr [242]
##    State_CD Min_rpt_yr Max_rpt_yr `(Max_rpt_yr - Min_rpt_yr)`     n
##       <int>      <int>      <int>                       <int> <int>
##  1        4       2003       2007                           4  7140
##  2        4       2004       2008                           4  7140
##  3        4       2005       2009                           4  7140
##  4        4       2006       2010                           4  7140
##  5        4       2007       2011                           4  7140
##  6        4       2008       2012                           4  7140
##  7        4       2009       2013                           4  7140
##  8        4       2010       2014                           4  7140
##  9        4       2011       2015                           4  7140
## 10        4       2012       2016                           4  7140
## # ... with 232 more rows
#View(check9)

Check 10: Skip

Check 11: Skip

Check 12: Check for CA, LA and MN State-Level Measures

list_of_states <- c("6", "22", "27")
check12 <- filter(BDdata, State_CD %in% list_of_states)
unique(check12$State_CD)
## [1] 22 27  6
#check12_actual <- 
  check12 %>% 
  group_by(State_CD, Cal_Level)%>%
  count(Cal_Level)
## # A tibble: 12 x 3
## # Groups:   State_CD, Cal_Level [12]
##    State_CD Cal_Level      n
##       <int> <fct>      <int>
##  1        6 S_C_Y      29406
##  2        6 S_C_Y_A    99528
##  3        6 S_C_Y_RE  117624
##  4        6 S_C_Y_S    54288
##  5       22 S_C_Y       8151
##  6       22 S_C_Y_A    27588
##  7       22 S_C_Y_RE   32604
##  8       22 S_C_Y_S    15048
##  9       27 S_C_Y      23751
## 10       27 S_C_Y_A    80388
## 11       27 S_C_Y_RE   95004
## 12       27 S_C_Y_S    43848
#View(check12_actual)

Check 13: Check rolling year count

length(which(BDdata$ROLLING_YEAR_COUNT != 5))
## [1] 0

Check 14: Check no data id

#Conditions for -1: birth defect count > 0 AND live birth count > 0 
check14_1 <- subset(BDdata, TOTAL_LIVE_BIRTHS_WITH_BD_AMT >0 & TOTAL_LIVE_BIRTHS_AMT>0)
length(which(check14_1$no_data_id != -1))
## [1] 301106
#Incorrect: n=301106 

#Conditions for 0: Birth defect count = 0 AND live birth count > 0
check14_0 <- subset(BDdata, TOTAL_LIVE_BIRTHS_WITH_BD_AMT ==0 & TOTAL_LIVE_BIRTHS_AMT>0)
length(which(check14_0$no_data_id != 0))
## [1] 0
#Correct

#Conditions for 2: NS race code where ethnicity = hispanic
check14_2 <- subset(BDdata, Raceethnicity_cd=="H")
NS <- subset(check14_2, no_data_id ==2)
length(which(check14_2$no_data_id == 2))
## [1] 45045
#Impossible to check in tier 4. No race variable to check NS with Raceethnicitiy code. 

#Conditions for 3: birth defect count > 0 and live birth count = 0
check14_3 <- subset(BDdata, TOTAL_LIVE_BIRTHS_WITH_BD_AMT >0 & TOTAL_LIVE_BIRTHS_AMT == 0)
length(which(check14_3$no_data_id != 3))
## [1] 0
#Correct

Check 15: Skip

Check 16: Suppression check

suppression <- subset(BDdata, EVENTS_AMT <6)
length(which(suppression$PS_Flag != 1))
## [1] 1475327

Check 17: Check VT nuance

VTnuance <- subset(BDdata, State_CD == 50 & Min_rpt_yr <2010)
length(which(VTnuance$Indicator_num== "I00010"))
## [1] 0
length(which(VTnuance$Indicator_num== "I00011"))
## [1] 0

Check 18: Check ME nuance

MEnuance <- subset(BDdata, State_CD == 23 & Min_rpt_yr <2008)
length(which(MEnuance$Indicator_num== "I00010"))
## [1] 0
length(which(MEnuance$Indicator_num== "I00011"))
## [1] 0

Check 19: Irrelevant

Check 20: Check KS nuance no records for Birth_def_cd=26 in 2012 and Birth_def_cd=31 in 2013

KSnuance <- subset(BDdata, State_CD == 20 & Min_rpt_yr == 2012)
length(which(KSnuance$Birth_def_CD== "26"))
## [1] 0
unique(KSnuance$Birth_def_CD) 
##  [1] 29 21 22 28 24 23 25 30 32 27
#31 is also missing

KSnuance2 <- subset(BDdata, State_CD == 20 & Min_rpt_yr == 2013)
length(which(KSnuance2$Birth_def_CD== "31"))
## [1] 0
#No data

Check 21: Check LA nuance

#Expanded variable not in T4. 
names(BDdata)
##  [1] "State_CD"                      "County_cd"                    
##  [3] "Birth_def_CD"                  "Indicator_num"                
##  [5] "Measure_Num"                   "Measure_Type"                 
##  [7] "RPT_YR"                        "ROLLING_YEAR_COUNT"           
##  [9] "INFANT_GNDR_CD"                "Min_rpt_yr"                   
## [11] "Max_rpt_yr"                    "Cal_Level"                    
## [13] "Age_band_cd"                   "Raceethnicity_cd"             
## [15] "PS_Flag"                       "STABILITY"                    
## [17] "PREVALENCE"                    "EVENTS_AMT"                   
## [19] "Is_Smoothed"                   "no_data_id"                   
## [21] "TOTAL_LIVE_BIRTHS_AMT"         "TOTAL_LIVE_BIRTHS_WITH_BD_AMT"