# Las siguientes líneas de comando es para cuando trabajamos en internet o con Jupiter Notebooks
#import piplite
#await piplite.install(['pandas'])
#await piplite.install(['matplotlib'])
#If you run the lab locally using Anaconda, you can load the correct library and versions by uncommenting the following:
#install specific version of libraries used in lab
#! mamba install pandas==1.3.3
#! mamba install numpy=1.21.2
import pandas as pd
import matplotlib.pylab as plt
#This function will download the dataset into your browser
#from pyodide.http import pyfetch
#async def download(url, filename):
# response = await pyfetch(url)
# if response.status == 200:
# with open(filename, "wb") as f:
# f.write(await response.bytes())
import numpy as np
# replace "?" to NaN
df.replace("?", np.nan, inplace = True)
df.head(5)
## symboling normalized-losses make ... city-mpg highway-mpg price
## 0 3 NaN alfa-romero ... 21 27 13495
## 1 3 NaN alfa-romero ... 21 27 16500
## 2 1 NaN alfa-romero ... 19 26 16500
## 3 2 164 audi ... 24 30 13950
## 4 2 164 audi ... 18 22 17450
##
## [5 rows x 26 columns]
missing_data = df.isnull()
missing_data.head(5)
## symboling normalized-losses make ... city-mpg highway-mpg price
## 0 False True False ... False False False
## 1 False True False ... False False False
## 2 False True False ... False False False
## 3 False False False ... False False False
## 4 False False False ... False False False
##
## [5 rows x 26 columns]
for column in missing_data.columns.values.tolist():
print(column)
print (missing_data[column].value_counts())
print("")
## symboling
## False 205
## Name: symboling, dtype: int64
##
## normalized-losses
## False 164
## True 41
## Name: normalized-losses, dtype: int64
##
## make
## False 205
## Name: make, dtype: int64
##
## fuel-type
## False 205
## Name: fuel-type, dtype: int64
##
## aspiration
## False 205
## Name: aspiration, dtype: int64
##
## num-of-doors
## False 203
## True 2
## Name: num-of-doors, dtype: int64
##
## body-style
## False 205
## Name: body-style, dtype: int64
##
## drive-wheels
## False 205
## Name: drive-wheels, dtype: int64
##
## engine-location
## False 205
## Name: engine-location, dtype: int64
##
## wheel-base
## False 205
## Name: wheel-base, dtype: int64
##
## length
## False 205
## Name: length, dtype: int64
##
## width
## False 205
## Name: width, dtype: int64
##
## height
## False 205
## Name: height, dtype: int64
##
## curb-weight
## False 205
## Name: curb-weight, dtype: int64
##
## engine-type
## False 205
## Name: engine-type, dtype: int64
##
## num-of-cylinders
## False 205
## Name: num-of-cylinders, dtype: int64
##
## engine-size
## False 205
## Name: engine-size, dtype: int64
##
## fuel-system
## False 205
## Name: fuel-system, dtype: int64
##
## bore
## False 201
## True 4
## Name: bore, dtype: int64
##
## stroke
## False 201
## True 4
## Name: stroke, dtype: int64
##
## compression-ratio
## False 205
## Name: compression-ratio, dtype: int64
##
## horsepower
## False 203
## True 2
## Name: horsepower, dtype: int64
##
## peak-rpm
## False 203
## True 2
## Name: peak-rpm, dtype: int64
##
## city-mpg
## False 205
## Name: city-mpg, dtype: int64
##
## highway-mpg
## False 205
## Name: highway-mpg, dtype: int64
##
## price
## False 201
## True 4
## Name: price, dtype: int64
avg_norm_loss = df["normalized-losses"].astype("float").mean(axis=0)
print("Average of normalized-losses:", avg_norm_loss)
## Average of normalized-losses: 122.0
df["normalized-losses"].replace(np.nan, avg_norm_loss, inplace=True)
avg_bore=df['bore'].astype('float').mean(axis=0)
print("Average of bore:", avg_bore)
## Average of bore: 3.3297512437810957
df["bore"].replace(np.nan, avg_bore, inplace=True)
# Write your code below and press Shift+Enter to execute
avg_stroke=df['stroke'].astype('float').mean(axis=0)
print("Average of stroke:", avg_stroke)
## Average of stroke: 3.2554228855721337
df["stroke"].replace(np.nan, avg_stroke, inplace=True)
avg_horsepower = df['horsepower'].astype('float').mean(axis=0)
print("Average horsepower:", avg_horsepower)
## Average horsepower: 104.25615763546799
df['horsepower'].replace(np.nan, avg_horsepower, inplace=True)
avg_peakrpm=df['peak-rpm'].astype('float').mean(axis=0)
print("Average peak rpm:", avg_peakrpm)
## Average peak rpm: 5125.369458128079
df['peak-rpm'].replace(np.nan, avg_peakrpm, inplace=True)
df['num-of-doors'].value_counts()
## four 114
## two 89
## Name: num-of-doors, dtype: int64
df['num-of-doors'].value_counts().idxmax()
## 'four'
#replace the missing 'num-of-doors' values by the most frequent
df["num-of-doors"].replace(np.nan, "four", inplace=True)
# simply drop whole row with NaN in "price" column
df.dropna(subset=["price"], axis=0, inplace=True)
# reset index, because we droped two rows
df.reset_index(drop=True, inplace=True)
df.head()
## symboling normalized-losses make ... city-mpg highway-mpg price
## 0 3 122.0 alfa-romero ... 21 27 13495
## 1 3 122.0 alfa-romero ... 21 27 16500
## 2 1 122.0 alfa-romero ... 19 26 16500
## 3 2 164 audi ... 24 30 13950
## 4 2 164 audi ... 18 22 17450
##
## [5 rows x 26 columns]
df.dtypes
## symboling int64
## normalized-losses object
## make object
## fuel-type object
## aspiration object
## num-of-doors object
## body-style object
## drive-wheels object
## engine-location object
## wheel-base float64
## length float64
## width float64
## height float64
## curb-weight int64
## engine-type object
## num-of-cylinders object
## engine-size int64
## fuel-system object
## bore object
## stroke object
## compression-ratio float64
## horsepower object
## peak-rpm object
## city-mpg int64
## highway-mpg int64
## price object
## dtype: object
df[["bore", "stroke"]] = df[["bore", "stroke"]].astype("float")
df[["normalized-losses"]] = df[["normalized-losses"]].astype("int")
df[["price"]] = df[["price"]].astype("float")
df[["peak-rpm"]] = df[["peak-rpm"]].astype("float")
df.dtypes
## symboling int64
## normalized-losses int32
## make object
## fuel-type object
## aspiration object
## num-of-doors object
## body-style object
## drive-wheels object
## engine-location object
## wheel-base float64
## length float64
## width float64
## height float64
## curb-weight int64
## engine-type object
## num-of-cylinders object
## engine-size int64
## fuel-system object
## bore float64
## stroke float64
## compression-ratio float64
## horsepower object
## peak-rpm float64
## city-mpg int64
## highway-mpg int64
## price float64
## dtype: object
df.head()
## symboling normalized-losses make ... city-mpg highway-mpg price
## 0 3 122 alfa-romero ... 21 27 13495.0
## 1 3 122 alfa-romero ... 21 27 16500.0
## 2 1 122 alfa-romero ... 19 26 16500.0
## 3 2 164 audi ... 24 30 13950.0
## 4 2 164 audi ... 18 22 17450.0
##
## [5 rows x 26 columns]
# Convert mpg to L/100km by mathematical operation (235 divided by mpg)
df['city-L/100km'] = 235/df["city-mpg"]
# check your transformed data
df.head()
## symboling normalized-losses ... price city-L/100km
## 0 3 122 ... 13495.0 11.190476
## 1 3 122 ... 16500.0 11.190476
## 2 1 122 ... 16500.0 12.368421
## 3 2 164 ... 13950.0 9.791667
## 4 2 164 ... 17450.0 13.055556
##
## [5 rows x 27 columns]
# Write your code below and press Shift+Enter to execute
# Convert mpg to L/100km by mathematical operation (235 divided by mpg)
df['highway-L/100km'] = 235/df["highway-mpg"]
# check your transformed data
df.head()
## symboling normalized-losses ... city-L/100km highway-L/100km
## 0 3 122 ... 11.190476 8.703704
## 1 3 122 ... 11.190476 8.703704
## 2 1 122 ... 12.368421 9.038462
## 3 2 164 ... 9.791667 7.833333
## 4 2 164 ... 13.055556 10.681818
##
## [5 rows x 28 columns]
# Respuesta correcta es ésta ya que había que hacer todo en la misma columna y cambiar el nombre de la columan
# transform mpg to L/100km by mathematical operation (235 divided by mpg)
df["highway-mpg"] = 235/df["highway-mpg"]
# rename column name from "highway-mpg" to "highway-L/100km"
df.rename(columns={'highway-mpg':'highway-L/100km'}, inplace=True)
# check your transformed data
df.head()
## symboling normalized-losses ... city-L/100km highway-L/100km
## 0 3 122 ... 11.190476 8.703704
## 1 3 122 ... 11.190476 8.703704
## 2 1 122 ... 12.368421 9.038462
## 3 2 164 ... 9.791667 7.833333
## 4 2 164 ... 13.055556 10.681818
##
## [5 rows x 28 columns]
# replace (original value) by (original value)/(maximum value)
df['length'] = df['length']/df['length'].max()
df['width'] = df['width']/df['width'].max()
# Write your code below and press Shift+Enter to execute
# replace (original value) by (original value)/(maximum value)
df['height'] = df['height']/df['height'].max()
# show the scaled columns
df[["length","width","height"]].head()
## length width height
## 0 0.811148 0.890278 0.816054
## 1 0.811148 0.890278 0.816054
## 2 0.822681 0.909722 0.876254
## 3 0.848630 0.919444 0.908027
## 4 0.848630 0.922222 0.908027
df["horsepower"]=df["horsepower"].astype(int, copy=True)
#%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
plt.pyplot.hist(df["horsepower"])
# set x/y labels and plot title
## (array([44., 45., 48., 24., 14., 16., 5., 4., 0., 1.]), array([ 48. , 69.4, 90.8, 112.2, 133.6, 155. , 176.4, 197.8, 219.2,
## 240.6, 262. ]), <BarContainer object of 10 artists>)
plt.pyplot.xlabel("horsepower")
plt.pyplot.ylabel("count")
plt.pyplot.title("horsepower bins")
plt.pyplot.show()
bins = np.linspace(min(df["horsepower"]), max(df["horsepower"]), 4)
bins
## array([ 48. , 119.33333333, 190.66666667, 262. ])
group_names = ['Low', 'Medium', 'High']
df['horsepower-binned'] = pd.cut(df['horsepower'], bins, labels=group_names, include_lowest=True )
df[['horsepower','horsepower-binned']].head(20)
## horsepower horsepower-binned
## 0 111 Low
## 1 111 Low
## 2 154 Medium
## 3 102 Low
## 4 115 Low
## 5 110 Low
## 6 110 Low
## 7 110 Low
## 8 140 Medium
## 9 101 Low
## 10 101 Low
## 11 121 Medium
## 12 121 Medium
## 13 121 Medium
## 14 182 Medium
## 15 182 Medium
## 16 182 Medium
## 17 48 Low
## 18 70 Low
## 19 70 Low
df["horsepower-binned"].value_counts()
## Low 153
## Medium 43
## High 5
## Name: horsepower-binned, dtype: int64
#%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
pyplot.bar(group_names, df["horsepower-binned"].value_counts())
# set x/y labels and plot title
## <BarContainer object of 3 artists>
plt.pyplot.xlabel("horsepower")
plt.pyplot.ylabel("count")
plt.pyplot.title("horsepower bins")
plt.pyplot.show()
#%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
# draw historgram of attribute "horsepower" with bins = 3
plt.pyplot.hist(df["horsepower"], bins = 3)
# set x/y labels and plot title
## (array([153., 43., 5.]), array([ 48. , 119.33333333, 190.66666667, 262. ]), <BarContainer object of 3 artists>)
plt.pyplot.xlabel("horsepower")
plt.pyplot.ylabel("count")
plt.pyplot.title("horsepower bins")
plt.pyplot.show()
df.columns
## Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
## 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
## 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
## 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
## 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
## 'highway-L/100km', 'price', 'city-L/100km', 'highway-L/100km',
## 'horsepower-binned'],
## dtype='object')
dummy_variable_1 = pd.get_dummies(df["fuel-type"])
dummy_variable_1.head()
## diesel gas
## 0 0 1
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
dummy_variable_1.rename(columns={'gas':'fuel-type-gas', 'diesel':'fuel-type-diesel'}, inplace=True)
dummy_variable_1.head()
## fuel-type-diesel fuel-type-gas
## 0 0 1
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
# merge data frame "df" and "dummy_variable_1"
df = pd.concat([df, dummy_variable_1], axis=1)
# drop original column "fuel-type" from "df"
df.drop("fuel-type", axis = 1, inplace=True)
df.head()
## symboling normalized-losses ... fuel-type-diesel fuel-type-gas
## 0 3 122 ... 0 1
## 1 3 122 ... 0 1
## 2 1 122 ... 0 1
## 3 2 164 ... 0 1
## 4 2 164 ... 0 1
##
## [5 rows x 30 columns]
# Write your code below and press Shift+Enter to execute
# Get the indicator variables and assign it to data frame "dummy_variable_2":
dummy_variable_2 = pd.get_dummies(df["aspiration"])
dummy_variable_2.head()
## std turbo
## 0 1 0
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
#Changing the column names for clarity:
dummy_variable_2.rename(columns={'std':'aspiration-std', 'turbo':'aspiration-turbo'}, inplace=True)
dummy_variable_2.head()
## aspiration-std aspiration-turbo
## 0 1 0
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
# Write your code below and press Shift+Enter to execute
# merge data frame "df" and "dummy_variable_2"
df = pd.concat([df, dummy_variable_2], axis=1)
# drop original column "aspiration" from "df"
df.drop("aspiration", axis = 1, inplace=True)
# The last two columns are now the indicator variable representation of the fuel-type variable. They're all 0s and 1s now.
df.head()
## symboling normalized-losses ... aspiration-std aspiration-turbo
## 0 3 122 ... 1 0
## 1 3 122 ... 1 0
## 2 1 122 ... 1 0
## 3 2 164 ... 1 0
## 4 2 164 ... 1 0
##
## [5 rows x 31 columns]
df.to_csv('clean_df.csv')