code here
library(haven)
## Warning: package 'haven' was built under R version 4.5.3
KiGGS03_06 <- read_sav("KiGGS03_06.sav")
KiGGS03_06 <- read_sav("C:/Users/sharm/Downloads/KiGGS03_06.sav")
View(KiGGS03_06)
#Creating new dataframe(Replacing xyz with the matriculation number)
kiggs_4008198 <- KiGGS03_06
kiggs_4008198 <- kiggs_4008198[, c("E070M", "E070V", "E072", "E074",
"sex", "age2", "schichtz", "e065z", "HbA1c")]
#Formating steps(The formatting steps clean and recode variables (e.g., labels, factors, missing values) to make the dataset ready for analysis)
kiggs_4008198$E070M <- factor(kiggs_4008198$E070M, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E070V <- factor(kiggs_4008198$E070V, labels = c("yes, daily", "yes, occasionally", "no"))
kiggs_4008198$E072 <- factor(kiggs_4008198$E072, labels = c("yes, regularly", "yes, from time to time", "no, never"))
kiggs_4008198$E074 <- factor(kiggs_4008198$E074, labels = c("yes, regularly", "yes, from time to time", "no, never", "has not breastfed"))
kiggs_4008198$sex <- factor(kiggs_4008198$sex, labels = c("boys", "girls"))
kiggs_4008198$age2 <- factor(kiggs_4008198$age2, labels = c("0-1y", "2-3y", "4-5y", "6-7y", "8-9y", "10-11y", "12-13y", "14-15y", "16-17y"))
kiggs_4008198$schichtz <- factor(kiggs_4008198$schichtz, labels = c("low social status", "medium social status", "high social status"))
#The formatting converts numeric variables into labeled categorical variables (factors) to improve interpretability and ensure correct statistical analysis.
save(kiggs_4008198, file = "kiggs_4008198.RData")
load("kiggs_4008198.RData")
# Checking structure
str(kiggs_4008198)
## tibble [17,640 × 9] (S3: tbl_df/tbl/data.frame)
## $ E070M : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
## $ E070V : Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
## $ E072 : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ E074 : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...
## $ sex : Factor w/ 2 levels "boys","girls": 1 1 2 2 1 2 1 2 2 1 ...
## $ age2 : Factor w/ 9 levels "0-1y","2-3y",..: 7 6 5 8 3 1 1 3 3 7 ...
## $ schichtz: Factor w/ 3 levels "low social status",..: 2 1 2 2 2 3 2 3 2 1 ...
## $ e065z : dbl+lbl [1:17640] NA, 10, 12, NA, 11, 14, 9, 11, 12, NA, NA, 10, 12, ...
## ..@ label : chr "Schlaf pro Tag insges. (Stunden)"
## ..@ format.spss : chr "F2.0"
## ..@ display_width: int 6
## ..@ labels : Named num [1:2] -8 -7
## .. ..- attr(*, "names")= chr [1:2] "Missing - unplausibel" "Nicht zutreffend für das Alter"
## $ HbA1c : dbl+lbl [1:17640] 4.2, 5.4, 4.4, 3.6, NA, NA, NA, 5.0, NA, 4.5, 5....
## ..@ label : chr "Glykohämoglobin 1c [%]"
## ..@ format.spss: chr "F8.1"
## ..@ labels : Named num -7
## .. ..- attr(*, "names")= chr "Nicht zutreffend für das Alter"
# Viewing first rows
head(kiggs_4008198)
## # A tibble: 6 × 9
## E070M E070V E072 E074 sex age2 schichtz e065z HbA1c
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <dbl> <dbl>
## 1 no no no, never no, never boys 12-1… medium … NA 4.2
## 2 no yes, daily no, never no, never boys 10-1… low soc… 10 5.4
## 3 yes, daily no no, never no, never girls 8-9y medium … 12 4.4
## 4 yes, daily yes, daily no, never no, never girls 14-1… medium … NA 3.6
## 5 no no no, never has not brea… boys 4-5y medium … 11 NA
## 6 no no no, never no, never girls 0-1y high so… 14 NA
##Question3: Data Formatting
str(kiggs_4008198[, c("E070M","E070V","E072","E074")])
## tibble [17,640 × 4] (S3: tbl_df/tbl/data.frame)
## $ E070M: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 3 1 1 3 3 3 3 3 3 ...
## $ E070V: Factor w/ 3 levels "yes, daily","yes, occasionally",..: 3 1 3 1 3 3 3 3 1 3 ...
## $ E072 : Factor w/ 3 levels "yes, regularly",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ E074 : Factor w/ 4 levels "yes, regularly",..: 3 3 3 3 4 3 3 3 3 3 ...
kiggs_4008198$E074[kiggs_4008198$E074 == "has not breastfed"] <- NA
kiggs_4008198$E074 <- droplevels(kiggs_4008198$E074)
table(kiggs_4008198$E074, useNA = "ifany")
##
## yes, regularly yes, from time to time no, never
## 281 1102 12117
## <NA>
## 4140
#Creating burdenS Coloum
kiggs_4008198$E070M <- as.numeric(kiggs_4008198$E070M)
kiggs_4008198$E070V <- as.numeric(kiggs_4008198$E070V)
kiggs_4008198$E072 <- as.numeric(kiggs_4008198$E072)
kiggs_4008198$E074 <- as.numeric(kiggs_4008198$E074)
kiggs_4008198$burdenS <- rowSums(
kiggs_4008198[, c("E070M","E070V","E072","E074")],
na.rm = TRUE
)
#Check
summary(kiggs_4008198$burdenS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 8.000 10.000 9.362 12.000 12.000
#The variable burdenS represents the total smoking exposure by summing the numeric factor levels of four smoking-related variables.
save(kiggs_4008198, file = "kiggs_4008198.RData")
#An alternative approach would be to create a binary variable (e.g., exposed vs. not exposed) or to use a weighted score instead of summing factor levels.
Question4: Descritptive analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
age_tab <- table(kiggs_4008198$age2)
sex_tab <- table(kiggs_4008198$sex)
burden_tab <- table(kiggs_4008198$burdenS)
knitr::kable(age_tab, col.names = c("Age group", "Frequency"))
| 0-1y |
1860 |
| 2-3y |
1879 |
| 4-5y |
1935 |
| 6-7y |
2032 |
| 8-9y |
2104 |
| 10-11y |
2076 |
| 12-13y |
2018 |
| 14-15y |
1972 |
| 16-17y |
1764 |
knitr::kable(sex_tab, col.names = c("Sex", "Frequency"))
knitr::kable(burden_tab, col.names = c("BurdenS", "Frequency"))
| 0 |
396 |
| 1 |
5 |
| 2 |
69 |
| 3 |
328 |
| 4 |
620 |
| 5 |
599 |
| 6 |
937 |
| 7 |
1178 |
| 8 |
1308 |
| 9 |
2061 |
| 10 |
2457 |
| 11 |
1083 |
| 12 |
6599 |
#Proportionate Table
# Age
table(kiggs_4008198$age2)
##
## 0-1y 2-3y 4-5y 6-7y 8-9y 10-11y 12-13y 14-15y 16-17y
## 1860 1879 1935 2032 2104 2076 2018 1972 1764
prop.table(table(kiggs_4008198$age2))
##
## 0-1y 2-3y 4-5y 6-7y 8-9y 10-11y 12-13y 14-15y
## 0.1054422 0.1065193 0.1096939 0.1151927 0.1192744 0.1176871 0.1143991 0.1117914
## 16-17y
## 0.1000000
age_tab <- table(kiggs_4008198$age2)
age_prop <- prop.table(age_tab)
age_table <- cbind(
Frequency = age_tab,
Proportion = round(age_prop, 3),
Percentage = round(age_prop * 100, 2)
)
knitr::kable(age_table)
| 0-1y |
1860 |
0.105 |
10.54 |
| 2-3y |
1879 |
0.107 |
10.65 |
| 4-5y |
1935 |
0.110 |
10.97 |
| 6-7y |
2032 |
0.115 |
11.52 |
| 8-9y |
2104 |
0.119 |
11.93 |
| 10-11y |
2076 |
0.118 |
11.77 |
| 12-13y |
2018 |
0.114 |
11.44 |
| 14-15y |
1972 |
0.112 |
11.18 |
| 16-17y |
1764 |
0.100 |
10.00 |
# Sex
table(kiggs_4008198$sex)
##
## boys girls
## 8985 8655
prop.table(table(kiggs_4008198$sex))
##
## boys girls
## 0.5093537 0.4906463
sex_tab <- table(kiggs_4008198$sex)
sex_prop <- prop.table(sex_tab)
sex_tab <- cbind(
Frequency = sex_tab,
Proportion = round(sex_prop, 3),
Percentage = round(sex_prop * 100, 2)
)
knitr::kable(sex_tab)
| boys |
8985 |
0.509 |
50.94 |
| girls |
8655 |
0.491 |
49.06 |
# burdenS
table(kiggs_4008198$burdenS)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 396 5 69 328 620 599 937 1178 1308 2061 2457 1083 6599
prop.table(table(kiggs_4008198$burdenS))
##
## 0 1 2 3 4 5
## 0.0224489796 0.0002834467 0.0039115646 0.0185941043 0.0351473923 0.0339569161
## 6 7 8 9 10 11
## 0.0531179138 0.0667800454 0.0741496599 0.1168367347 0.1392857143 0.0613945578
## 12
## 0.3740929705
burden_tab <- table(kiggs_4008198$burdenS)
burden_prop <- prop.table(burden_tab)
burden_tab <- cbind(
Frequency = burden_tab,
Proportion = round(burden_prop, 3),
Percentage = round(burden_prop * 100, 2)
)
knitr::kable(burden_tab)
| 0 |
396 |
0.022 |
2.24 |
| 1 |
5 |
0.000 |
0.03 |
| 2 |
69 |
0.004 |
0.39 |
| 3 |
328 |
0.019 |
1.86 |
| 4 |
620 |
0.035 |
3.51 |
| 5 |
599 |
0.034 |
3.40 |
| 6 |
937 |
0.053 |
5.31 |
| 7 |
1178 |
0.067 |
6.68 |
| 8 |
1308 |
0.074 |
7.41 |
| 9 |
2061 |
0.117 |
11.68 |
| 10 |
2457 |
0.139 |
13.93 |
| 11 |
1083 |
0.061 |
6.14 |
| 12 |
6599 |
0.374 |
37.41 |
#ONE COMBINED TABLE
age_df <- as.data.frame(age_tab)
sex_df <- as.data.frame(sex_tab)
burden_df <- as.data.frame(burden_tab)
knitr::kable(age_df)
| 0-1y |
1860 |
| 2-3y |
1879 |
| 4-5y |
1935 |
| 6-7y |
2032 |
| 8-9y |
2104 |
| 10-11y |
2076 |
| 12-13y |
2018 |
| 14-15y |
1972 |
| 16-17y |
1764 |
knitr::kable(sex_df)
| boys |
8985 |
0.509 |
50.94 |
| girls |
8655 |
0.491 |
49.06 |
knitr::kable(burden_df)
| 0 |
396 |
0.022 |
2.24 |
| 1 |
5 |
0.000 |
0.03 |
| 2 |
69 |
0.004 |
0.39 |
| 3 |
328 |
0.019 |
1.86 |
| 4 |
620 |
0.035 |
3.51 |
| 5 |
599 |
0.034 |
3.40 |
| 6 |
937 |
0.053 |
5.31 |
| 7 |
1178 |
0.067 |
6.68 |
| 8 |
1308 |
0.074 |
7.41 |
| 9 |
2061 |
0.117 |
11.68 |
| 10 |
2457 |
0.139 |
13.93 |
| 11 |
1083 |
0.061 |
6.14 |
| 12 |
6599 |
0.374 |
37.41 |
# Missing values
sum(is.na(kiggs_4008198$age2))
## [1] 0
sum(is.na(kiggs_4008198$sex))
## [1] 0
sum(is.na(kiggs_4008198$burdenS))
## [1] 0
# Complete cases
sum(complete.cases(kiggs_4008198[, c("age2", "sex", "burdenS")]))
## [1] 17640
library(ggplot2)
library(ggplot2)
ggplot(kiggs_4008198, aes(x = burdenS)) +
geom_histogram() +
labs(title = "Distribution of smoking exposure (burdenS)",
x = "Burden score",
y = "Percentage")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(kiggs_4008198, aes(x = factor(burdenS))) +
geom_bar()

#The variable age2 shows the distribution of children across age
groups, with most observations in the middle age categories in age group
pf 8-9 with 11.93%. #The variable sex indicates the sample consists of
both boys and girls with approximately balanced proportions i.e 52.94%
in boys and 49.06% in females . #The variable burdenS reflects
cumulative smoking exposure, where higher values indicate greater
exposure, highest burden 12 with 37.41% among 6599 people. #Some missing
values are present, reducing the number of complete observations
available for analysis.