Question 2: Import and Data Preparation

code here

library(haven)
## Warning: package 'haven' was built under R version 4.5.3
KiGGS03_06 <- read_sav("KiGGS03_06.sav")
KiGGS03_06 <- read_sav("C:/Users/sharm/Downloads/KiGGS03_06.sav")
View(KiGGS03_06)
#Creating new dataframe(Replacing xyz with the matriculation number)
kiggs_4008198 <- KiGGS03_06
kiggs_4008198 <- kiggs_4008198[, c("E070M", "E070V", "E072", "E074",
                          "sex", "age2", "schichtz", "e065z", "HbA1c")]
                          
#Formating steps(The formatting steps clean and recode variables (e.g., labels, factors, missing values) to make the dataset ready for analysis)

kiggs_4008198$E070M    <- factor(kiggs_4008198$E070M, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E070V    <- factor(kiggs_4008198$E070V, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E072     <- factor(kiggs_4008198$E072, labels = c("yes, regularly", "yes, from time to time", "no, never"))
kiggs_4008198$E074     <- factor(kiggs_4008198$E074, labels = c("yes, regularly", "yes, from time to time", "no, never", "has not breastfed"))
kiggs_4008198$sex      <- factor(kiggs_4008198$sex, labels = c("boys", "girls"))
kiggs_4008198$age2     <- factor(kiggs_4008198$age2, labels = c("0-1y", "2-3y", "4-5y", "6-7y", "8-9y", "10-11y", "12-13y", "14-15y", "16-17y"))
kiggs_4008198$schichtz <- factor(kiggs_4008198$schichtz, labels = c("low social status", "medium social status", "high social status"))                 
#The formatting converts numeric variables into labeled categorical variables (factors) to improve interpretability and ensure correct statistical analysis.


save(kiggs_4008198, file = "kiggs_4008198.RData")

load("kiggs_4008198.RData")

# Checking structure
str(kiggs_4008198)
## tibble [17,640 × 9] (S3: tbl_df/tbl/data.frame)
##  $ E070M   : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
##  $ E070V   : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
##  $ E072    : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ E074    : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...
##  $ sex     : Factor w/ 2 levels "boys","girls": 1 1 2 2 1 2 1 2 2 1 ...
##  $ age2    : Factor w/ 9 levels "0-1y","2-3y",..: 7 6 5 8 3 1 1 3 3 7 ...
##  $ schichtz: Factor w/ 3 levels "low social status",..: 2 1 2 2 2 3 2 3 2 1 ...
##  $ e065z   : dbl+lbl [1:17640] NA, 10, 12, NA, 11, 14,  9, 11, 12, NA, NA, 10, 12, ...
##    ..@ label        : chr "Schlaf pro Tag insges. (Stunden)"
##    ..@ format.spss  : chr "F2.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num [1:2] -8 -7
##    .. ..- attr(*, "names")= chr [1:2] "Missing - unplausibel" "Nicht zutreffend für das Alter"
##  $ HbA1c   : dbl+lbl [1:17640] 4.2, 5.4, 4.4, 3.6,  NA,  NA,  NA, 5.0,  NA, 4.5, 5....
##    ..@ label      : chr "Glykohämoglobin 1c [%]"
##    ..@ format.spss: chr "F8.1"
##    ..@ labels     : Named num -7
##    .. ..- attr(*, "names")= chr "Nicht zutreffend für das Alter"
# Viewing first rows
head(kiggs_4008198)
## # A tibble: 6 × 9
##   E070M      E070V      E072      E074          sex   age2  schichtz e065z HbA1c
##   <fct>      <fct>      <fct>     <fct>         <fct> <fct> <fct>    <dbl> <dbl>
## 1 no         no         no, never no, never     boys  12-1… medium … NA     4.2 
## 2 no         yes, daily no, never no, never     boys  10-1… low soc… 10     5.4 
## 3 yes, daily no         no, never no, never     girls 8-9y  medium … 12     4.4 
## 4 yes, daily yes, daily no, never no, never     girls 14-1… medium … NA     3.6 
## 5 no         no         no, never has not brea… boys  4-5y  medium … 11    NA   
## 6 no         no         no, never no, never     girls 0-1y  high so… 14    NA

##Question3: Data Formatting

str(kiggs_4008198[, c("E070M","E070V","E072","E074")])
## tibble [17,640 × 4] (S3: tbl_df/tbl/data.frame)
##  $ E070M: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
##  $ E070V: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
##  $ E072 : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ E074 : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...
kiggs_4008198$E074[kiggs_4008198$E074 == "has not breastfed"] <- NA

kiggs_4008198$E074 <- droplevels(kiggs_4008198$E074)
table(kiggs_4008198$E074, useNA = "ifany")
## 
##         yes, regularly yes, from time to time              no, never 
##                    281                   1102                  12117 
##                   <NA> 
##                   4140
#Creating burdenS Coloum
kiggs_4008198$E070M <- as.numeric(kiggs_4008198$E070M)
kiggs_4008198$E070V <- as.numeric(kiggs_4008198$E070V)
kiggs_4008198$E072  <- as.numeric(kiggs_4008198$E072)
kiggs_4008198$E074  <- as.numeric(kiggs_4008198$E074)

kiggs_4008198$burdenS <- rowSums(
  kiggs_4008198[, c("E070M","E070V","E072","E074")],
  na.rm = TRUE
)
#Check
summary(kiggs_4008198$burdenS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   8.000  10.000   9.362  12.000  12.000
#The variable burdenS represents the total smoking exposure by summing the numeric factor levels of four smoking-related variables.

save(kiggs_4008198, file = "kiggs_4008198.RData")

#An alternative approach would be to create a binary variable (e.g., exposed vs. not exposed) or to use a weighted score instead of summing factor levels.

Question4: Descritptive analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)

age_tab <- table(kiggs_4008198$age2)
sex_tab <- table(kiggs_4008198$sex)
burden_tab <- table(kiggs_4008198$burdenS)

knitr::kable(age_tab, col.names = c("Age group", "Frequency"))
Age group Frequency
0-1y 1860
2-3y 1879
4-5y 1935
6-7y 2032
8-9y 2104
10-11y 2076
12-13y 2018
14-15y 1972
16-17y 1764
knitr::kable(sex_tab, col.names = c("Sex", "Frequency"))
Sex Frequency
boys 8985
girls 8655
knitr::kable(burden_tab, col.names = c("BurdenS", "Frequency"))
BurdenS Frequency
0 396
1 5
2 69
3 328
4 620
5 599
6 937
7 1178
8 1308
9 2061
10 2457
11 1083
12 6599
#Proportionate Table
# Age
table(kiggs_4008198$age2)
## 
##   0-1y   2-3y   4-5y   6-7y   8-9y 10-11y 12-13y 14-15y 16-17y 
##   1860   1879   1935   2032   2104   2076   2018   1972   1764
prop.table(table(kiggs_4008198$age2))
## 
##      0-1y      2-3y      4-5y      6-7y      8-9y    10-11y    12-13y    14-15y 
## 0.1054422 0.1065193 0.1096939 0.1151927 0.1192744 0.1176871 0.1143991 0.1117914 
##    16-17y 
## 0.1000000
age_tab <- table(kiggs_4008198$age2)

age_prop <- prop.table(age_tab)

age_table <- cbind(
  Frequency = age_tab,
  Proportion = round(age_prop, 3),
  Percentage = round(age_prop * 100, 2)
)

knitr::kable(age_table)
Frequency Proportion Percentage
0-1y 1860 0.105 10.54
2-3y 1879 0.107 10.65
4-5y 1935 0.110 10.97
6-7y 2032 0.115 11.52
8-9y 2104 0.119 11.93
10-11y 2076 0.118 11.77
12-13y 2018 0.114 11.44
14-15y 1972 0.112 11.18
16-17y 1764 0.100 10.00
# Sex
table(kiggs_4008198$sex)
## 
##  boys girls 
##  8985  8655
prop.table(table(kiggs_4008198$sex))
## 
##      boys     girls 
## 0.5093537 0.4906463
sex_tab <- table(kiggs_4008198$sex)

sex_prop <- prop.table(sex_tab)

sex_tab <- cbind(
  Frequency = sex_tab,
  Proportion = round(sex_prop, 3),
  Percentage = round(sex_prop * 100, 2)
)

knitr::kable(sex_tab)
Frequency Proportion Percentage
boys 8985 0.509 50.94
girls 8655 0.491 49.06
# burdenS
table(kiggs_4008198$burdenS)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12 
##  396    5   69  328  620  599  937 1178 1308 2061 2457 1083 6599
prop.table(table(kiggs_4008198$burdenS))
## 
##            0            1            2            3            4            5 
## 0.0224489796 0.0002834467 0.0039115646 0.0185941043 0.0351473923 0.0339569161 
##            6            7            8            9           10           11 
## 0.0531179138 0.0667800454 0.0741496599 0.1168367347 0.1392857143 0.0613945578 
##           12 
## 0.3740929705
burden_tab <- table(kiggs_4008198$burdenS)

burden_prop <- prop.table(burden_tab)

burden_tab <- cbind(
  Frequency = burden_tab,
  Proportion = round(burden_prop, 3),
  Percentage = round(burden_prop * 100, 2)
)

knitr::kable(burden_tab)
Frequency Proportion Percentage
0 396 0.022 2.24
1 5 0.000 0.03
2 69 0.004 0.39
3 328 0.019 1.86
4 620 0.035 3.51
5 599 0.034 3.40
6 937 0.053 5.31
7 1178 0.067 6.68
8 1308 0.074 7.41
9 2061 0.117 11.68
10 2457 0.139 13.93
11 1083 0.061 6.14
12 6599 0.374 37.41
#ONE COMBINED TABLE
age_df <- as.data.frame(age_tab)
sex_df <- as.data.frame(sex_tab)
burden_df <- as.data.frame(burden_tab)

knitr::kable(age_df)
Var1 Freq
0-1y 1860
2-3y 1879
4-5y 1935
6-7y 2032
8-9y 2104
10-11y 2076
12-13y 2018
14-15y 1972
16-17y 1764
knitr::kable(sex_df)
Frequency Proportion Percentage
boys 8985 0.509 50.94
girls 8655 0.491 49.06
knitr::kable(burden_df)
Frequency Proportion Percentage
0 396 0.022 2.24
1 5 0.000 0.03
2 69 0.004 0.39
3 328 0.019 1.86
4 620 0.035 3.51
5 599 0.034 3.40
6 937 0.053 5.31
7 1178 0.067 6.68
8 1308 0.074 7.41
9 2061 0.117 11.68
10 2457 0.139 13.93
11 1083 0.061 6.14
12 6599 0.374 37.41
# Missing values
sum(is.na(kiggs_4008198$age2))
## [1] 0
sum(is.na(kiggs_4008198$sex))
## [1] 0
sum(is.na(kiggs_4008198$burdenS))
## [1] 0
# Complete cases
sum(complete.cases(kiggs_4008198[, c("age2", "sex", "burdenS")]))
## [1] 17640
library(ggplot2)

library(ggplot2)

ggplot(kiggs_4008198, aes(x = burdenS)) +
  geom_histogram() +
  labs(title = "Distribution of smoking exposure (burdenS)",
       x = "Burden score",
       y = "Percentage")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(kiggs_4008198, aes(x = factor(burdenS))) +
  geom_bar()

#The variable age2 shows the distribution of children across age groups, with most observations in the middle age categories in age group pf 8-9 with 11.93%. #The variable sex indicates the sample consists of both boys and girls with approximately balanced proportions i.e 52.94% in boys and 49.06% in females . #The variable burdenS reflects cumulative smoking exposure, where higher values indicate greater exposure, highest burden 12 with 37.41% among 6599 people. #Some missing values are present, reducing the number of complete observations available for analysis.