import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Data source: https://open.canada.ca
# NOTE: the CSV is downloaded over the network each time this line runs.
raw_data = pd.read_csv(
    "https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"
)
# Work on an independent copy so the downloaded frame stays untouched.
df = raw_data.copy(deep=True)
# Preview the first 5 rows.
# NOTE: a bare expression is only displayed in an interactive / notebook
# session; the "##" lines below are output captured at the time of writing
# and will differ as the live dataset is updated.
df.head(n=5)
## pruid prname ... avgdeaths_last7 avgratedeaths_last7
## 0 59 British Columbia ... 0.0 0.0
## 1 48 Alberta ... 0.0 0.0
## 2 47 Saskatchewan ... 0.0 0.0
## 3 46 Manitoba ... 0.0 0.0
## 4 35 Ontario ... 0.0 0.0
##
## [5 rows x 23 columns]
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns -- 20 of the 23 columns in the captured output.
df.describe()
## pruid reporting_week ... avgdeaths_last7 avgratedeaths_last7
## count 1905.000000 1905.000000 ... 1905.000000 1778.000000
## mean 39.200000 24.803150 ... 6.289071 0.089927
## std 26.290483 14.668092 ... 18.005564 0.153713
## min 1.000000 1.000000 ... 0.000000 0.000000
## 25% 12.000000 12.000000 ... 0.000000 0.000000
## 50% 46.000000 23.000000 ... 0.140000 0.020000
## 75% 60.000000 37.000000 ... 3.430000 0.120000
## max 99.000000 53.000000 ... 160.430000 1.260000
##
## [8 rows x 20 columns]
# Remove the pruid column from df in place (raw_data still keeps it).
# NOTE(review): pruid is numeric, so describe() summarises it above, but it is
# presumably an identifier code rather than a measurement -- confirm.
df.drop(columns=["pruid"], inplace=True)
# Re-run the summary: one fewer numeric column (19) in the captured output.
df.describe()
## reporting_week reporting_year ... avgdeaths_last7 avgratedeaths_last7
## count 1905.000000 1905.000000 ... 1905.000000 1778.000000
## mean 24.803150 2020.818898 ... 6.289071 0.089927
## std 14.668092 0.747025 ... 18.005564 0.153713
## min 1.000000 2020.000000 ... 0.000000 0.000000
## 25% 12.000000 2020.000000 ... 0.000000 0.000000
## 50% 23.000000 2021.000000 ... 0.140000 0.020000
## 75% 37.000000 2021.000000 ... 3.430000 0.120000
## max 53.000000 2022.000000 ... 160.430000 1.260000
##
## [8 rows x 19 columns]