Vamos a utilizar el Dataset “World cities” para explorar los datos con la siguiente hipótesis:
“Dado que el Reino Unido (UK) fue uno de los principales países que colonizaron los Estados Unidos (USA), y UK se encuentra en el lado este de USA, entonces hay más ciudades/poblados con nombres de ciudades de UK en la costa este de USA en comparación a la costa oeste”
datos <- "https://raw.githubusercontent.com/duberc/R-Python-SQL-en-R-Markdown/main/wordlcities.csv"
datos = read.csv(datos,header = TRUE,sep = ",")
estados <- "https://raw.githubusercontent.com/duberc/R-Python-SQL-en-R-Markdown/main/estadosUS.csv"
estados = read.csv(estados,header = TRUE,sep = ",")
# revisamos los nombres y tipos de datos
summary(datos)
Country City AccentCity Region Population Latitude Longitude
Length:5846 Length:5846 Length:5846 Length:5846 Min. : 216 Min. :17.97 Min. :-165.406
Class :character Class :character Class :character Class :character 1st Qu.: 11725 1st Qu.:36.20 1st Qu.: -92.399
Mode :character Mode :character Mode :character Mode :character Median : 19742 Median :40.85 Median : -80.084
Mean : 44974 Mean :41.24 Mean : -64.978
3rd Qu.: 37844 3rd Qu.:44.80 3rd Qu.: -5.049
Max. :8107916 Max. :71.29 Max. : 36.000
#resumen con estadistica descriptiva
glimpse(datos)
Rows: 5,846
Columns: 7
$ Country <chr> "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es...
$ City <chr> "abaran", "a coruna", "adeje", "adra", "aguilar de la frontera", "aguilas", "aguimes", "albacete", "albal", "albolote", "al...
$ AccentCity <chr> "Abarán", "A Coruña", "Adeje", "Adra", "Aguilar de la Frontera", "Ã\u0081guilas", "Agüimes", "Albacete", "Albal", "Albol...
$ Region <chr> "31", "58", "53", "51", "51", "31", "53", "54", "60", "51", "60", "51", "51", "29", "51", "52", "31", "51", "54", "29", "29...
$ Population <dbl> 13850, 236010, 22245, 23988, 13557, 30739, 21908, 155083, 13507, 14355, 19431, 10688, 58934, 195152, 21377, 13884, 37969, 1...
$ Latitude <dbl> 38.20551, 43.36661, 28.11984, 36.74740, 37.51445, 37.40598, 27.90505, 38.99585, 39.40000, 37.23006, 39.50000, 37.38856, 37....
$ Longitude <dbl> -1.399072, -8.406812, -16.725581, -3.015989, -4.656173, -1.585300, -15.445395, -1.857773, -0.416667, -3.655454, -0.350000, ...
# inspeccion rapida de la data
introduce(datos)
# revisamos como esta compuesto el dataset y vemos que no existen N/As
plot_intro(datos)
# lo comprobamos
plot_missing(datos)
# Pasar de R Dataframe a Python
df = r.datos
# eliminar NA's
df = df.dropna()
# eliminar duplicados
df = df.drop_duplicates()
# recuperar dataframe de python y pasarlo a R
datos = py$df
# comprobamos que eliminamos lo NA's
plot_missing(datos)
# Pasar un dataframe a BD para lectura SQL
copy_to(db, datos, overwrite = TRUE)
-- crear una salida SQL en el reporte para encontrar el nombre de paises segun ciudades de interes
select distinct Country, City
from datos where (City like "%Manchester%" or City like "%Texas%")
and Population > 0
# Creamos una columna nueva validando que el nombre de la ciudad en US exista en GB.
us$Comparar <- as.factor(ifelse(us$City %in% gb$City, 'yes','no'))
# Pasar un dataframe a BD para lectura SQL
copy_to(db, us,verwrite = TRUE)
copy_to(db, gb,verwrite = TRUE)
# Listado de tablas en la BD
dbListTables(db)
[1] "datos" "gb" "sqlite_stat1" "sqlite_stat4" "us"
# crear un parámetro para filtro en SQL
pop = 0
# pasar el nombre de una columna como parametro
filtro <- glue::glue_sql("Region", .con = db)
# crear 2 o mas parámetros para el uso de "IN"
ciudades <- c("New York", "Los Angeles", "Miami", "Anchorage")
# pasar la variable ciudades a class SQL
ciudades = glue::glue_sql("{ciudades*}", .con = db)
# Veamos
ciudades
<SQL> 'New York', 'Los Angeles', 'Miami', 'Anchorage'
-- ejecutar sentencia para usar los 3 par攼㸱metros
select distinct * from us where ?filtro = "FL" and AccentCity in (?ciudades) and Population >?pop order by Population desc limit 10
select * from datos where Country = "es" and AccentCity = "Adeje"
-- ejemplo delete
Delete from datos where Country = "es" and AccentCity = "Adeje"
-- ejemplo inser into
insert into datos values ("es","adeje","Adeje","53","22245","28.11984","-16.72558")
-- validar e insert
select * from datos where Country = "es" and AccentCity = "Adeje"
-- buscar todas la ciudades de us que tengan nombres de ciudades de gb
select distinct * from us where City in (select City from gb) and Population >?pop order by Population desc
-- Validar en US si una Ciudad - Region se repite
select count(*) as cantidad, df1.City, df1.Region, df1.Population, df1.Country
from us as df1 where df1.city in (select City from gb) and df1.Population >0
group by df1.City, df1.Region, df1.Population having cantidad>1 order by df1.Population desc
-- Ciudades mas importante Costa Este que no tengan nombre exacto en gb
select Region, City, Population
from us where City not in (select Distinct City from gb) and Location = 'Costa Este'
order by Population desc limit 10
-- Ciudades mas importante Costa Oeste que no tengan nombre exacto en gb
select Region, City, Population
from us where City not in (select Distinct City from gb) and Location = 'Costa Oeste'
order by Population desc limit 10
-- Ciudades mas importante de ambas costas que no existen explicitamente en gb
select City, Population, Location, Comparar
from us where Comparar ='no' and Location like "Costa%"
order by Population desc limit 20
-- Ciudades mas importante de ambas costas que no existen explicitamente en gb
select City, Population, Location, Comparar
from us where Comparar = 'no' and Location like "Costa%"
order by Population desc limit 20
# agregar el df test a la bd
copy_to(db, test, overwrite = TRUE)
-- validar la tabla
select * from test
-- hacer un split de las ciudades sin resultados en gb
select SUBSTRING(City, CHARINDEX(' ', City) + 1, length(City) - CHARINDEX(' ', City)) as City
from test
union
select SUBSTRING(City, 1, CHARINDEX(' ', City) - 1) as City
from test
-- guardar como table
select SUBSTRING(City, CHARINDEX(' ', City) + 1, length(City) - CHARINDEX(' ', City)) as City
from test
union
select SUBSTRING(City, 1, CHARINDEX(' ', City) - 1) as City
from test
# agregar df test1 a la tabla
copy_to(db, test1)
-- buscar por string en gb para ver coincidencias
select City,Population from gb where City in (select City from test1) order by Population
-- Dado que New York viene de "York" en GB actualizamos el registro
update us set Comparar = "yes"
where City = "new york"
-- validamos
select * from us
where City = "new york"
-- crear una salida SQL como dataframe R usando "output.var"
SELECT * FROM us where Comparar = "yes" order by Population desc
| Lugar | Ctd |
|---|---|
| Costa Este | 103 |
| Costa Oeste | 12 |
| Medio Oeste | 33 |
| Noroeste | 1 |
| Oeste | 6 |
| Sureste | 13 |
| Suroeste | 9 |
Ver el df en Python
Region Country City ... Name Location Comparar
0 NY us new york ... New York Costa Este yes
1 TX us houston ... Texas Suroeste yes
2 MA us boston ... Massachusetts Costa Este yes
3 DC us washington ... District of Columbia Costa Este yes
4 NE us lincoln ... Nebraska Medio Oeste yes
.. ... ... ... ... ... ... ...
172 NH us chester ... New Hampshire Costa Este yes
173 VT us newport ... Vermont Costa Este yes
174 ID us preston ... Idaho Noroeste yes
175 NH us wakefield ... New Hampshire Costa Este yes
176 VT us manchester ... Vermont Costa Este yes
[177 rows x 10 columns]
describe() nos permite conocer datos generales del dataframe
Population Latitude Longitude
count 1.770000e+02 177.000000 177.000000
mean 9.617597e+04 40.142351 -83.318123
std 6.276584e+05 3.686483 14.688758
min 2.160000e+02 26.244167 -124.052222
25% 1.169900e+04 38.533889 -89.519167
50% 1.943200e+04 41.159444 -77.441944
75% 3.534900e+04 42.595000 -71.688611
max 8.107916e+06 46.975556 -68.505556
Ejemplo de grafico en Python
[<matplotlib.lines.Line2D object at 0x000002749A4963C8>]
summary(py$df)
Region Country City AccentCity Population Latitude Longitude
Length:177 Length:177 Length:177 Length:177 Min. : 216 Min. :26.24 Min. :-124.05
Class :character Class :character Class :character Class :character 1st Qu.: 11699 1st Qu.:38.53 1st Qu.: -89.52
Mode :character Mode :character Mode :character Mode :character Median : 19432 Median :41.16 Median : -77.44
Mean : 96176 Mean :40.14 Mean : -83.32
3rd Qu.: 35349 3rd Qu.:42.59 3rd Qu.: -71.69
Max. :8107916 Max. :46.98 Max. : -68.51
Name Location Comparar
Length:177 Length:177 Length:177
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
# ver las ciudades en un mapa
us %>%
filter(Comparar=="yes") %>%
select(AccentCity,Latitude, Longitude) %>%
leaflet( width = 900) %>%
addTiles()%>%
addMarkers(clusterOptions = markerClusterOptions(),popup = ~htmlEscape(AccentCity))
NA