Download this R Markdown file, save it on your computer, and complete all the tasks below by inserting your answers as text or as R chunks. When you are done, upload this file with your solutions to Moodle.
# Example chunk: add one and two
1 + 2
## [1] 3
Answer # 1 (Result is Hidden)
# Add one and one
1 + 1
Answer # 2 (Result with no echo)
## [1] 2
Load the dataset and answer the following questions:
# Load the dataset: read the CSV directly from its download link into a
# data frame
data_import <- read.csv(
  file = url("https://www.dropbox.com/s/tqrauwuxyi03kee/Pima_diabetes.csv?dl=1")
)
# How many women have a Glucose level of 0?
# Summing the logical comparison counts the TRUE entries; na.rm = TRUE skips
# rows where Glucose is NA.
glucose_null <- sum(data_import$Glucose == 0, na.rm = TRUE)
# paste() already separates its arguments with a single space, so the label
# must not end in a space of its own (that would print two spaces).
print(paste("Women with Glucose level of 0 is", glucose_null))
## [1] "Women with Glucose level of 0 is 5"
# Number of women whose recorded Insulin level is exactly 0.
# which() keeps only the positions that are TRUE (NA comparisons are dropped),
# so its length is the count.
insulin_0 <- length(which(data_import$Insulin == 0))
print(paste("Women with Insulin level of 0 is:", insulin_0))
## [1] "Women with Insulin level of 0 is: 374"
# Number of women for whom Glucose AND Insulin are both 0.
# with() evaluates the column names inside data_import.
all_zero <- with(
  data_import,
  sum(Glucose == 0 & Insulin == 0, na.rm = TRUE)
)
print(paste("Women with both Glucose and Insulin levels 0:", all_zero))
## [1] "Women with both Glucose and Insulin levels 0: 4"
# Number of women for whom Glucose OR Insulin (or both) is 0.
# The elementwise `|` builds one flag per row; which() then keeps the TRUE
# positions and ignores NA.
either_zero <- length(
  which(data_import$Glucose == 0 | data_import$Insulin == 0)
)
print(paste("Women with either Glucose or Insulin levels 0:", either_zero))
## [1] "Women with either Glucose or Insulin levels 0: 375"
# Number of women whose BMI value is missing (NA).
# NOTE(review): is.na() only counts values that were read in as NA; if this
# dataset encodes a missing BMI as 0, those rows are not counted -- confirm.
bmi_miss <- sum(is.na(data_import[["BMI"]]))
print(paste("Women with missing BMI values are:", bmi_miss))
## [1] "Women with missing BMI values are: 0"
# Number of women with a BMI strictly greater than 40 (NA values are ignored)
bmi_above_40 <- length(which(data_import[["BMI"]] > 40))
print(paste("Women with BMI larger than 40 has values:", bmi_above_40))
## [1] "Women with BMI larger than 40 has values: 96"
# Keep only the rows (all columns) of women whose BMI exceeds 40.
# which() returns the matching row positions and silently drops NA
# comparisons, so rows with a missing BMI are excluded.
data_bmi_40 <- data_import[which(data_import$BMI > 40), ]
print(paste("Dataset with BMI>40 has", nrow(data_bmi_40), "findings"))
## [1] "Dataset with BMI>40 has 96 findings"
# Add the indicator variable BMIOutlier: 1 if a woman's BMI is higher than 50,
# 0 if it is smaller than or equal to 50 (NA stays NA).
# Converting the logical comparison to numeric maps TRUE -> 1 and FALSE -> 0.
data_import$BMIOutlier <- as.numeric(data_import$BMI > 50)
print("BMIOutlier variable created. Summary:")
## [1] "BMIOutlier variable created. Summary:"
# Frequency table of the new variable, with an explicit NA column
table(data_import$BMIOutlier, useNA = "always")
##
## 0 1 <NA>
## 760 8 0
Explore merging two datasets.
As preparation, run the following code to create several data frames:
# Download the data set used for the merging exercise
dat_ex3 <- read.csv(
  file = url("https://www.dropbox.com/s/tqrauwuxyi03kee/Pima_diabetes.csv?dl=1")
)
# Carve out four smaller data frames:
#   dat3_1: rows 1-100,   columns 1-3
#   dat3_2: rows 101-300, columns 1-3
#   dat3_3: rows 1-100,   columns 1-3 (same selection as dat3_1)
#   dat3_4: rows 1-100,   columns 4-6
dat3_1 <- dat_ex3[seq_len(100), seq_len(3)]
dat3_2 <- dat_ex3[seq(101, 300), seq_len(3)]
dat3_3 <- dat_ex3[seq_len(100), seq_len(3)]
dat3_4 <- dat_ex3[seq_len(100), seq(4, 6)]
Task 3a: Think about how you can use the [.] operator to piece together dat3_1 and dat3_2, and likewise dat3_3 and dat3_4, into one data frame each.
# dat3_1 and dat3_2 share the same columns but hold different rows, so they
# are stacked vertically (row-bound).
combinedV <- do.call(rbind, list(dat3_1, dat3_2))
print(paste("combination dimensions:", nrow(combinedV), "rows,", ncol(combinedV), "columns"))
## [1] "combination dimensions: 300 rows, 3 columns"
# dat3_3 and dat3_4 hold the same rows but different columns, so they are
# placed side by side (column-bound). check.names = FALSE leaves the column
# names untouched.
combinedH <- data.frame(dat3_3, dat3_4, check.names = FALSE)
print(paste("combination dimensions:", nrow(combinedH), "rows,", ncol(combinedH), "columns"))
## [1] "combination dimensions: 100 rows, 6 columns"
Task 3b: Explore the help of the merge() function in R in order to achieve the same goal of combining dat3_3 and dat3_4 together into one data frame. Hint: first create an ID variable in each data frame, then use this in the “by” argument.
# Give each data frame an explicit ID column to merge on.
# seq_len() is used instead of 1:nrow(): for an empty data frame 1:0 would be
# c(1, 0) and the assignment would fail, whereas seq_len(0) is an empty vector.
dat3_3$ID <- seq_len(nrow(dat3_3))
dat3_4$ID <- seq_len(nrow(dat3_4))
# Join the two data frames on the shared ID key; rows with the same ID are
# combined into one row holding the columns of both inputs.
merged_data <- merge(dat3_3, dat3_4, by = "ID")
print(paste("Merged data dimensions:", dim(merged_data)[1], "rows,", dim(merged_data)[2], "columns"))
## [1] "Merged data dimensions: 100 rows, 7 columns"
# Display first few rows to show successful merge
head(merged_data)
## ID Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 4 1 89 66 23 94 28.1
## 5 5 0 137 40 35 168 43.1
## 6 6 5 116 74 0 0 25.6
Create an Excel file with 5 observations of 2 variables. Variable 1 is just an ID variable (number 1-5 or character string etc.), and variable 2 is a date/time variable. Use variable 2 to describe the time (and day) you had lunch in the last 5 days. Then try to import the Excel file with both variables into R and/or transform the variables in R to Date or POSIXct variables.
# Load required package for reading Excel files
# install.packages("readxl") # Run this if package not installed
library(readxl)
# Create sample lunch data: one row per day, lunch time stored as text in
# "YYYY-MM-DD HH:MM:SS" form
lunch_samples <- data.frame(
  ID = 1:5,
  LunchT = c(
    "2025-06-21 12:30:00",
    "2025-06-22 13:00:00",
    "2025-06-23 12:45:00",
    "2025-06-24 13:15:00",
    "2025-06-25 12:00:00"
  )
)
print("Original lunch data:")
## [1] "Original lunch data:"
print(lunch_samples)
## ID LunchT
## 1 1 2025-06-21 12:30:00
## 2 2 2025-06-22 13:00:00
## 3 3 2025-06-23 12:45:00
## 4 4 2025-06-24 13:15:00
## 5 5 2025-06-25 12:00:00
# Convert to POSIXct date-time format.
# The format and the time zone are given explicitly so that parsing does not
# depend on the time zone of the R session running the script. The wall-clock
# values are stored as UTC, so the printed times stay exactly as typed.
# NOTE(review): use the local zone name instead of "UTC" if the true instants
# (rather than the wall-clock times) matter.
lunch_samples$LunchT <- as.POSIXct(
  lunch_samples$LunchT,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
print("After converting to POSIXct:")
## [1] "After converting to POSIXct:"
print(lunch_samples)
## ID LunchT
## 1 1 2025-06-21 12:30:00
## 2 2 2025-06-22 13:00:00
## 3 3 2025-06-23 12:45:00
## 4 4 2025-06-24 13:15:00
## 5 5 2025-06-25 12:00:00
# Check the structure
str(lunch_samples)
## 'data.frame': 5 obs. of 2 variables:
## $ ID : int 1 2 3 4 5
## $ LunchT: POSIXct, format: "2025-06-21 12:30:00" "2025-06-22 13:00:00" ...
# Extract additional date/time components as character columns:
# hour (00-23), full weekday name, and the calendar date
lunch_samples$Hour_of_Lunch <- format(lunch_samples$LunchT, "%H")
lunch_samples$Day_of_Lunch <- format(lunch_samples$LunchT, "%A")
lunch_samples$Date_of_lunch <- format(lunch_samples$LunchT, "%Y-%m-%d")
print("Final dataset with extracted components:")
## [1] "Final dataset with extracted components:"
print(lunch_samples)
## ID LunchT Hour_of_Lunch Day_of_Lunch Date_of_lunch
## 1 1 2025-06-21 12:30:00 12 Saturday 2025-06-21
## 2 2 2025-06-22 13:00:00 13 Sunday 2025-06-22
## 3 3 2025-06-23 12:45:00 12 Monday 2025-06-23
## 4 4 2025-06-24 13:15:00 13 Tuesday 2025-06-24
## 5 5 2025-06-25 12:00:00 12 Wednesday 2025-06-25