Download this R Markdown file, save it on your computer, and complete all of the tasks below by inserting your answers as text or as R chunks. After you are done, upload this file with your solutions to Moodle.

Exercise 1 Solution

  1. Create an R chunk here to insert R code. Add R code in this R chunk to perform a simple calculation (e.g. calculate the sum of 1 and 2).
# Simple calculation: add 1 and 2; the result is auto-printed.
1 + 2
## [1] 3
  1. Create an R chunk with a basic calculation (e.g. 1+1). Try out the different ways how to include this in the knitted report.

Answer # 1 (Result is Hidden)

# Basic calculation: add 1 and 1.
1 + 1

Answer # 2 (Result with no echo)

## [1] 2
  1. Knit this Rmd file to html and to pdf.

Exercise 2: Manipulating variables and data frames

Load the dataset and answer the following questions:

# Load the Pima diabetes data set (CSV) from its Dropbox download link.
data_import <- read.csv(
  file = url(
    "https://www.dropbox.com/s/tqrauwuxyi03kee/Pima_diabetes.csv?dl=1"
  )
)
# How many women have Glucose levels 0?
# `which()` returns only the positions that are TRUE, so missing values are
# never counted.
glucose_null <- length(which(data_import[["Glucose"]] == 0))
print(paste("Women with Glucose level of 0 is ", glucose_null))
## [1] "Women with Glucose level of 0 is  5"
# How many women have Insulin levels 0?
insulin_0 <- length(which(data_import[["Insulin"]] == 0))
print(paste("Women with Insulin level of 0 is:", insulin_0))
## [1] "Women with Insulin level of 0 is: 374"
# How many women have both Glucose levels as well as Insulin levels 0?
# Elementwise `&` builds the row mask; NA comparisons are dropped by na.rm.
all_zero <- with(
  data_import,
  sum(Glucose == 0 & Insulin == 0, na.rm = TRUE)
)
print(paste("Women with both Glucose and Insulin levels 0:", all_zero))
## [1] "Women with both Glucose and Insulin levels 0: 4"
# How many women have either Glucose levels or Insulin levels 0?
either_zero <- with(
  data_import,
  sum(Glucose == 0 | Insulin == 0, na.rm = TRUE)
)
print(paste("Women with either Glucose or Insulin levels 0:", either_zero))
## [1] "Women with either Glucose or Insulin levels 0: 375"
# How many women have missing BMI values?
bmi_miss <- length(which(is.na(data_import[["BMI"]])))
print(paste("Women with missing BMI values are:", bmi_miss))
## [1] "Women with missing BMI values are: 0"
# How many women have BMI larger than 40?
bmi_above_40 <- length(which(data_import[["BMI"]] > 40))
print(paste("Women with BMI larger than 40 has values:", bmi_above_40))
## [1] "Women with BMI larger than 40 has values: 96"
# Build a dataset that only includes the women with BMI>40
# Indexing by `which()` skips rows whose BMI is missing.
data_bmi_40 <- data_import[which(data_import[["BMI"]] > 40), ]
print(paste("Dataset with BMI>40 has", nrow(data_bmi_40), "findings"))
## [1] "Dataset with BMI>40 has 96 findings"
# Create a new variable named BMIOutlier, which has the value 0 if a woman
# has BMI smaller or equal 50, and 1 if she has BMI higher than 50.
# Converting the logical comparison to numeric gives 1 / 0 (NA stays NA).
data_import[["BMIOutlier"]] <- as.numeric(data_import[["BMI"]] > 50)
print("BMIOutlier variable created. Summary:")
## [1] "BMIOutlier variable created. Summary:"
table(data_import[["BMIOutlier"]], useNA = "always")
## 
##    0    1 <NA> 
##  760    8    0

Exercise 3 (optional)

Explore merging two datasets.

As a preparation, execute the following code to create different data frames

# import data
dat_ex3 <- read.csv(
  file = url(
    "https://www.dropbox.com/s/tqrauwuxyi03kee/Pima_diabetes.csv?dl=1"
  )
)

# extract two smaller data sets
# Same three columns, two different row ranges (rows 1-100 and 101-300).
dat3_1 <- dat_ex3[seq_len(100), seq_len(3)]
dat3_2 <- dat_ex3[seq(101, 300), seq_len(3)]

# Same 100 rows, two different column ranges (columns 1-3 and 4-6).
dat3_3 <- dat_ex3[seq_len(100), seq_len(3)]
dat3_4 <- dat_ex3[seq_len(100), seq(4, 6)]

Task 3a: Think about how you can use the [.] operator to respectively piece dat3_1 and dat3_2, and dat3_3 and dat3_4 together into one data frame.

# dat3_1 and dat3_2 have the same columns but different rows, so they are
# stacked on top of each other with rbind().
combinedV <- rbind(dat3_1, dat3_2)
print(paste(
  "combination dimensions:", nrow(combinedV),
  "rows,", ncol(combinedV), "columns"
))
## [1] "combination dimensions: 300 rows, 3 columns"
# dat3_3 and dat3_4 have the same rows but different columns, so they are
# placed side by side with cbind().
combinedH <- cbind(dat3_3, dat3_4)
print(paste(
  "combination dimensions:", nrow(combinedH),
  "rows,", ncol(combinedH), "columns"
))
## [1] "combination dimensions: 100 rows, 6 columns"

Task 3b: Explore the help of the merge() function in R in order to achieve the same goal of combining dat3_3 and dat3_4 together into one data frame. Hint: first create an ID variable in each data frame, then use this in the “by” argument.

# First create ID variables in each data frame.
# seq_len() is used instead of 1:nrow(): for a zero-row data frame 1:nrow()
# would produce c(1, 0) and the assignment would fail, whereas seq_len(0)
# yields an empty index.
dat3_3$ID <- seq_len(nrow(dat3_3))
dat3_4$ID <- seq_len(nrow(dat3_4))

# Merge using the ID variable as the join key; ID appears once in the
# result, followed by the remaining columns of both data frames.
merged_data <- merge(dat3_3, dat3_4, by = "ID")
print(paste(
  "Merged data dimensions:", nrow(merged_data),
  "rows,", ncol(merged_data), "columns"
))
## [1] "Merged data dimensions: 100 rows, 7 columns"
# Display first few rows to show successful merge
head(merged_data)
##   ID Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1  1           6     148            72            35       0 33.6
## 2  2           1      85            66            29       0 26.6
## 3  3           8     183            64             0       0 23.3
## 4  4           1      89            66            23      94 28.1
## 5  5           0     137            40            35     168 43.1
## 6  6           5     116            74             0       0 25.6

Exercise 4 (optional): Times and dates in R

Create an Excel file with 5 observations of 2 variables. Variable 1 is just an ID variable (number 1-5 or character string etc.), and variable 2 is a date/time variable. Use variable 2 to describe the time (and day) you had lunch in the last 5 days. Then try to import the Excel file with both variables into R and/or transform the variables in R to Date or POSIXct variables.

# Load required package for reading Excel files
# install.packages("readxl")  # Run this if package not installed
library(readxl)



# Create sample lunch data: an integer ID plus the lunch timestamp of each
# of five days, stored as character strings at first.
lunch_samples <- data.frame(
  ID = seq_len(5),
  LunchT = c(
    "2025-06-21 12:30:00",
    "2025-06-22 13:00:00",
    "2025-06-23 12:45:00",
    "2025-06-24 13:15:00",
    "2025-06-25 12:00:00"
  )
)

print("Original lunch data:")
## [1] "Original lunch data:"
print(lunch_samples)
##   ID              LunchT
## 1  1 2025-06-21 12:30:00
## 2  2 2025-06-22 13:00:00
## 3  3 2025-06-23 12:45:00
## 4  4 2025-06-24 13:15:00
## 5  5 2025-06-25 12:00:00
# Convert to POSIXct date-time format.
# No `tz` is passed, so the strings are interpreted in the session's
# time zone.
lunch_samples[["LunchT"]] <- as.POSIXct(lunch_samples[["LunchT"]])

print("After converting to POSIXct:")
## [1] "After converting to POSIXct:"
print(lunch_samples)
##   ID              LunchT
## 1  1 2025-06-21 12:30:00
## 2  2 2025-06-22 13:00:00
## 3  3 2025-06-23 12:45:00
## 4  4 2025-06-24 13:15:00
## 5  5 2025-06-25 12:00:00
# Check the structure
str(lunch_samples)
## 'data.frame':    5 obs. of  2 variables:
##  $ ID    : int  1 2 3 4 5
##  $ LunchT: POSIXct, format: "2025-06-21 12:30:00" "2025-06-22 13:00:00" ...
# Extract additional date/time components as character columns:
# hour of day, weekday name and calendar date.
lunch_samples[["Hour_of_Lunch"]] <- format(
  lunch_samples[["LunchT"]],
  format = "%H"
)
lunch_samples[["Day_of_Lunch"]] <- format(
  lunch_samples[["LunchT"]],
  format = "%A"
)
lunch_samples[["Date_of_lunch"]] <- format(
  lunch_samples[["LunchT"]],
  format = "%Y-%m-%d"
)

print("Final dataset with extracted components:")
## [1] "Final dataset with extracted components:"
print(lunch_samples)
##   ID              LunchT Hour_of_Lunch Day_of_Lunch Date_of_lunch
## 1  1 2025-06-21 12:30:00            12     Saturday    2025-06-21
## 2  2 2025-06-22 13:00:00            13       Sunday    2025-06-22
## 3  3 2025-06-23 12:45:00            12       Monday    2025-06-23
## 4  4 2025-06-24 13:15:00            13      Tuesday    2025-06-24
## 5  5 2025-06-25 12:00:00            12    Wednesday    2025-06-25