#install.packages("sqldf")
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.2.2
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
data1 <- read.csv("quiz2.csv",header = TRUE,sep = ",")
summary(data1)
## subregion region pop income
## Length:29 Length:29 Min. : 2311 Min. :26784
## Class :character Class :character 1st Qu.: 19005 1st Qu.:37970
## Mode :character Mode :character Median : 32122 Median :41595
## Mean : 135940 Mean :43858
## 3rd Qu.: 101482 3rd Qu.:47469
## Max. :1554720 Max. :70821
## ipaddr ufo2010 infections
## Min. : 637 Min. : 0.00 Min. : 39
## 1st Qu.: 12294 1st Qu.: 0.00 1st Qu.: 123
## Median : 30418 Median : 2.00 Median : 245
## Mean : 440130 Mean : 16.66 Mean :1117
## 3rd Qu.: 102104 3rd Qu.: 9.00 3rd Qu.: 672
## Max. :5394949 Max. :169.00 Max. :6781
sqldf("SELECT * FROM data1")
sqldf('SELECT * FROM data1 LIMIT 10')
sqldf("SELECT * FROM data1 ORDER BY infections DESC LIMIT 5")
sqldf("SELECT * FROM data1 ORDER BY ipaddr DESC LIMIT 7")
sqldf("SELECT * FROM data1 ORDER BY ufo2010 DESC LIMIT 7")
sqldf("SELECT * FROM data1 ORDER BY infections DESC LIMIT 7")
sqldf('SELECT region,infections FROM data1 WHERE infections > 1000 order by 2 desc')
sqldf('SELECT region,income,pop FROM data1 WHERE income > 50000 order by 2 desc')
sqldf('SELECT * FROM data1 WHERE (income > 50000 AND infections > 2000) OR pop > 100000')
sqldf('SELECT * FROM data1 WHERE (income > 30000 AND infections > 1000) OR pop > 50000')
query that returns every region and population with a number of infections between 1000 and 7000; in 7 regions the number of infections range from 1000 - 7000
sqldf('SELECT region,pop FROM data1 WHERE infections between 1000 and 7000')
create a query that returns all data for all records with subregion starting with letter ‘a’ 29 rows of data returned with subregion starting with letter ‘a’ this query can be useful to analyze subregions by alphabetical order (don’t see it to be to practical, but can’t think of any practical case either)
sqldf('SELECT * FROM data1 WHERE subregion LIKE "A%"')
sqldf("SELECT AVG(infections) FROM data1")
getwd()
## [1] "C:/1rdata"
data2 <- read.csv("finalexam.csv",header = TRUE,sep = ",")
summary(data2)
## region pop income ipaddr
## Length:29 Min. : 2311 Min. :26784 Min. : 637
## Class :character 1st Qu.: 19005 1st Qu.:37970 1st Qu.: 12294
## Mode :character Median : 32122 Median :41595 Median : 30418
## Mean : 135940 Mean :43858 Mean : 440130
## 3rd Qu.: 101482 3rd Qu.:47469 3rd Qu.: 102104
## Max. :1554720 Max. :70821 Max. :5394949
## ufo2010 infections
## Min. : 0.00 Min. : 39
## 1st Qu.: 0.00 1st Qu.: 123
## Median : 2.00 Median : 245
## Mean : 16.66 Mean :1117
## 3rd Qu.: 9.00 3rd Qu.: 672
## Max. :169.00 Max. :6781
sqldf("SELECT * FROM data2")
sqldf('SELECT * FROM data2 WHERE (region == "East" AND infections > 1500)')
18-a. Region affects the number of infections
sqldf('SELECT region,infections FROM data2 order by 1 desc')
sqldf('SELECT region, round(AVG(infections)) from data2 group by Region order by 2 desc')
18-b. Population affects the number of infections
sqldf('SELECT pop,infections FROM data2 order by 1 desc') #displayed but not saved
z1 <- sqldf('SELECT pop,infections FROM data2 order by 1 desc') #no 'c' to display as DF and save as 'z1' (CORRECT APPROACH)
z1
z1 <- c(sqldf('SELECT pop,infections FROM data2 order by 1 desc')) #with 'c' to display as rows of data and save as 'z1' (WRONG APPROACH)
z1
## $pop
## [1] 1554720 459598 409061 305455 251417 162812 153920 101482 67197
## [10] 61912 37276 36745 34365 33341 32122 31459 28350 25581
## [19] 25101 22286 20679 19005 18675 16148 15927 7481 3915
## [28] 3911 2311
##
## $infections
## [1] 5689 3298 5023 672 290 279 234 2356 502 215 205 2389 126 2076 67
## [16] 52 54 123 245 116 120 6781 195 261 257 189 430 112 39
18-c.Income affects the number of infections
sqldf('SELECT income,infections FROM data2 order by 1 desc')
18-d. ufo2010 affects the number of infections
sqldf('SELECT ufo2010,infections FROM data2 order by 1 desc')
18-e. The different interactions of region and income, and income and population affects the number of infections.
sqldf('SELECT region,income,infections FROM data2 order by 1 desc')
sqldf('SELECT income,pop,infections FROM data2 order by 1 desc')