Question 2: Import and Data Preparation

code here

library(haven)

## Warning: package 'haven' was built under R version 4.5.3

KiGGS03_06 <- read_sav("KiGGS03_06.sav")
KiGGS03_06 <- read_sav("C:/Users/sharm/Downloads/KiGGS03_06.sav")
View(KiGGS03_06)

#Creating new dataframe(Replacing xyz with the matriculation number)
kiggs_4008198 <- KiGGS03_06
kiggs_4008198 <- kiggs_4008198[, c("E070M", "E070V", "E072", "E074",
                          "sex", "age2", "schichtz", "e065z", "HbA1c")]
                          
#Formating steps(The formatting steps clean and recode variables (e.g., labels, factors, missing values) to make the dataset ready for analysis)

kiggs_4008198$E070M    <- factor(kiggs_4008198$E070M, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E070V    <- factor(kiggs_4008198$E070V, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E072     <- factor(kiggs_4008198$E072, labels = c("yes, regularly", "yes, from time to time", "no, never"))
kiggs_4008198$E074     <- factor(kiggs_4008198$E074, labels = c("yes, regularly", "yes, from time to time", "no, never", "has not breastfed"))
kiggs_4008198$sex      <- factor(kiggs_4008198$sex, labels = c("boys", "girls"))
kiggs_4008198$age2     <- factor(kiggs_4008198$age2, labels = c("0-1y", "2-3y", "4-5y", "6-7y", "8-9y", "10-11y", "12-13y", "14-15y", "16-17y"))
kiggs_4008198$schichtz <- factor(kiggs_4008198$schichtz, labels = c("low social status", "medium social status", "high social status"))                 
#The formatting converts numeric variables into labeled categorical variables (factors) to improve interpretability and ensure correct statistical analysis.


save(kiggs_4008198, file = "kiggs_4008198.RData")

load("kiggs_4008198.RData")

# Checking structure
str(kiggs_4008198)

## tibble [17,640 × 9] (S3: tbl_df/tbl/data.frame)
##  $ E070M   : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
##  $ E070V   : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
##  $ E072    : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ E074    : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...
##  $ sex     : Factor w/ 2 levels "boys","girls": 1 1 2 2 1 2 1 2 2 1 ...
##  $ age2    : Factor w/ 9 levels "0-1y","2-3y",..: 7 6 5 8 3 1 1 3 3 7 ...
##  $ schichtz: Factor w/ 3 levels "low social status",..: 2 1 2 2 2 3 2 3 2 1 ...
##  $ e065z   : dbl+lbl [1:17640] NA, 10, 12, NA, 11, 14,  9, 11, 12, NA, NA, 10, 12, ...
##    ..@ label        : chr "Schlaf pro Tag insges. (Stunden)"
##    ..@ format.spss  : chr "F2.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num [1:2] -8 -7
##    .. ..- attr(*, "names")= chr [1:2] "Missing - unplausibel" "Nicht zutreffend für das Alter"
##  $ HbA1c   : dbl+lbl [1:17640] 4.2, 5.4, 4.4, 3.6,  NA,  NA,  NA, 5.0,  NA, 4.5, 5....
##    ..@ label      : chr "Glykohämoglobin 1c [%]"
##    ..@ format.spss: chr "F8.1"
##    ..@ labels     : Named num -7
##    .. ..- attr(*, "names")= chr "Nicht zutreffend für das Alter"

# Viewing first rows
head(kiggs_4008198)

## # A tibble: 6 × 9
##   E070M      E070V      E072      E074          sex   age2  schichtz e065z HbA1c
##   <fct>      <fct>      <fct>     <fct>         <fct> <fct> <fct>    <dbl> <dbl>
## 1 no         no         no, never no, never     boys  12-1… medium … NA     4.2 
## 2 no         yes, daily no, never no, never     boys  10-1… low soc… 10     5.4 
## 3 yes, daily no         no, never no, never     girls 8-9y  medium … 12     4.4 
## 4 yes, daily yes, daily no, never no, never     girls 14-1… medium … NA     3.6 
## 5 no         no         no, never has not brea… boys  4-5y  medium … 11    NA   
## 6 no         no         no, never no, never     girls 0-1y  high so… 14    NA

##Question3: Data Formatting

str(kiggs_4008198[, c("E070M","E070V","E072","E074")])

## tibble [17,640 × 4] (S3: tbl_df/tbl/data.frame)
##  $ E070M: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
##  $ E070V: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
##  $ E072 : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ E074 : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...

kiggs_4008198$E074[kiggs_4008198$E074 == "has not breastfed"] <- NA

kiggs_4008198$E074 <- droplevels(kiggs_4008198$E074)
table(kiggs_4008198$E074, useNA = "ifany")

## 
##         yes, regularly yes, from time to time              no, never 
##                    281                   1102                  12117 
##                   <NA> 
##                   4140

#Creating burdenS Coloum
kiggs_4008198$E070M <- as.numeric(kiggs_4008198$E070M)
kiggs_4008198$E070V <- as.numeric(kiggs_4008198$E070V)
kiggs_4008198$E072  <- as.numeric(kiggs_4008198$E072)
kiggs_4008198$E074  <- as.numeric(kiggs_4008198$E074)

kiggs_4008198$burdenS <- rowSums(
  kiggs_4008198[, c("E070M","E070V","E072","E074")],
  na.rm = TRUE
)
#Check
summary(kiggs_4008198$burdenS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   8.000  10.000   9.362  12.000  12.000

#The variable burdenS represents the total smoking exposure by summing the numeric factor levels of four smoking-related variables.

save(kiggs_4008198, file = "kiggs_4008198.RData")

#An alternative approach would be to create a binary variable (e.g., exposed vs. not exposed) or to use a weighted score instead of summing factor levels.

Question4: Descritptive analysis

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)

age_tab <- table(kiggs_4008198$age2)
sex_tab <- table(kiggs_4008198$sex)
burden_tab <- table(kiggs_4008198$burdenS)

knitr::kable(age_tab, col.names = c("Age group", "Frequency"))

Age group	Frequency
0-1y	1860
2-3y	1879
4-5y	1935
6-7y	2032
8-9y	2104
10-11y	2076
12-13y	2018
14-15y	1972
16-17y	1764

knitr::kable(sex_tab, col.names = c("Sex", "Frequency"))

Sex	Frequency
boys	8985
girls	8655

knitr::kable(burden_tab, col.names = c("BurdenS", "Frequency"))

BurdenS	Frequency
0	396
1	5
2	69
3	328
4	620
5	599
6	937
7	1178
8	1308
9	2061
10	2457
11	1083
12	6599

#Proportionate Table
# Age
table(kiggs_4008198$age2)

## 
##   0-1y   2-3y   4-5y   6-7y   8-9y 10-11y 12-13y 14-15y 16-17y 
##   1860   1879   1935   2032   2104   2076   2018   1972   1764

prop.table(table(kiggs_4008198$age2))

## 
##      0-1y      2-3y      4-5y      6-7y      8-9y    10-11y    12-13y    14-15y 
## 0.1054422 0.1065193 0.1096939 0.1151927 0.1192744 0.1176871 0.1143991 0.1117914 
##    16-17y 
## 0.1000000

age_tab <- table(kiggs_4008198$age2)

age_prop <- prop.table(age_tab)

age_table <- cbind(
  Frequency = age_tab,
  Proportion = round(age_prop, 3),
  Percentage = round(age_prop * 100, 2)
)

knitr::kable(age_table)

	Frequency	Proportion	Percentage
0-1y	1860	0.105	10.54
2-3y	1879	0.107	10.65
4-5y	1935	0.110	10.97
6-7y	2032	0.115	11.52
8-9y	2104	0.119	11.93
10-11y	2076	0.118	11.77
12-13y	2018	0.114	11.44
14-15y	1972	0.112	11.18
16-17y	1764	0.100	10.00

# Sex
table(kiggs_4008198$sex)

## 
##  boys girls 
##  8985  8655

prop.table(table(kiggs_4008198$sex))

## 
##      boys     girls 
## 0.5093537 0.4906463

sex_tab <- table(kiggs_4008198$sex)

sex_prop <- prop.table(sex_tab)

sex_tab <- cbind(
  Frequency = sex_tab,
  Proportion = round(sex_prop, 3),
  Percentage = round(sex_prop * 100, 2)
)

knitr::kable(sex_tab)

	Frequency	Proportion	Percentage
boys	8985	0.509	50.94
girls	8655	0.491	49.06

# burdenS
table(kiggs_4008198$burdenS)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12 
##  396    5   69  328  620  599  937 1178 1308 2061 2457 1083 6599

prop.table(table(kiggs_4008198$burdenS))

## 
##            0            1            2            3            4            5 
## 0.0224489796 0.0002834467 0.0039115646 0.0185941043 0.0351473923 0.0339569161 
##            6            7            8            9           10           11 
## 0.0531179138 0.0667800454 0.0741496599 0.1168367347 0.1392857143 0.0613945578 
##           12 
## 0.3740929705

burden_tab <- table(kiggs_4008198$burdenS)

burden_prop <- prop.table(burden_tab)

burden_tab <- cbind(
  Frequency = burden_tab,
  Proportion = round(burden_prop, 3),
  Percentage = round(burden_prop * 100, 2)
)

knitr::kable(burden_tab)

	Frequency	Proportion	Percentage
0	396	0.022	2.24
1	5	0.000	0.03
2	69	0.004	0.39
3	328	0.019	1.86
4	620	0.035	3.51
5	599	0.034	3.40
6	937	0.053	5.31
7	1178	0.067	6.68
8	1308	0.074	7.41
9	2061	0.117	11.68
10	2457	0.139	13.93
11	1083	0.061	6.14
12	6599	0.374	37.41

#ONE COMBINED TABLE
age_df <- as.data.frame(age_tab)
sex_df <- as.data.frame(sex_tab)
burden_df <- as.data.frame(burden_tab)

knitr::kable(age_df)

Var1	Freq
0-1y	1860
2-3y	1879
4-5y	1935
6-7y	2032
8-9y	2104
10-11y	2076
12-13y	2018
14-15y	1972
16-17y	1764

knitr::kable(sex_df)

	Frequency	Proportion	Percentage
boys	8985	0.509	50.94
girls	8655	0.491	49.06

knitr::kable(burden_df)

	Frequency	Proportion	Percentage
0	396	0.022	2.24
1	5	0.000	0.03
2	69	0.004	0.39
3	328	0.019	1.86
4	620	0.035	3.51
5	599	0.034	3.40
6	937	0.053	5.31
7	1178	0.067	6.68
8	1308	0.074	7.41
9	2061	0.117	11.68
10	2457	0.139	13.93
11	1083	0.061	6.14
12	6599	0.374	37.41

# Missing values
sum(is.na(kiggs_4008198$age2))

## [1] 0

sum(is.na(kiggs_4008198$sex))

## [1] 0

sum(is.na(kiggs_4008198$burdenS))

## [1] 0

# Complete cases
sum(complete.cases(kiggs_4008198[, c("age2", "sex", "burdenS")]))

## [1] 17640

library(ggplot2)

library(ggplot2)

ggplot(kiggs_4008198, aes(x = burdenS)) +
  geom_histogram() +
  labs(title = "Distribution of smoking exposure (burdenS)",
       x = "Burden score",
       y = "Percentage")

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(kiggs_4008198, aes(x = factor(burdenS))) +
  geom_bar()

#The variable age2 shows the distribution of children across age groups, with most observations in the middle age categories in age group pf 8-9 with 11.93%. #The variable sex indicates the sample consists of both boys and girls with approximately balanced proportions i.e 52.94% in boys and 49.06% in females . #The variable burdenS reflects cumulative smoking exposure, where higher values indicate greater exposure, highest burden 12 with 37.41% among 6599 people. #Some missing values are present, reducing the number of complete observations available for analysis.

Final_Exam_4008198

Riya

2026-04-11

Question 2: Import and Data Preparation

code here

Question4: Descritptive analysis