November, 2021 - François HU
Master of Science - EPITA
This lecture is available here: https://curiousml.github.io/
! cat data/defra_consumption.csv
;England;Wales;Scotland;N Ireland Cheese;105;103;103;66 Carcass meat;245;227;242;267 Other meat;685;803;750;586 Fish;147;160;122;93 Fats and oils;193;235;184;209 Sugars;156;175;147;139 Fresh potatoes;720;874;566;1033 Fresh Veg;253;265;171;143 Other Veg;488;570;418;355 Processed potatoes;198;203;220;187 Processed Veg;360;365;337;334 Fresh fruit;1102;1137;957;674 Cereals;1472;1582;1462;1494 Beverages;57;73;53;47 Soft drinks;1374;1256;1572;1506 Alcoholic drinks;375;475;458;135 Confectionery;54;64;62;41
import pandas as pd
# Read the ';'-separated consumption table; the first (unnamed) column holds
# the food-category labels and is used as the row index.
consumption_csv = 'data/defra_consumption.csv'
consumption = pd.read_csv(consumption_csv, sep=';', index_col=0)
consumption.head(10)
England | Wales | Scotland | N Ireland | |
---|---|---|---|---|
Cheese | 105 | 103 | 103 | 66 |
Carcass meat | 245 | 227 | 242 | 267 |
Other meat | 685 | 803 | 750 | 586 |
Fish | 147 | 160 | 122 | 93 |
Fats and oils | 193 | 235 | 184 | 209 |
Sugars | 156 | 175 | 147 | 139 |
Fresh potatoes | 720 | 874 | 566 | 1033 |
Fresh Veg | 253 | 265 | 171 | 143 |
Other Veg | 488 | 570 | 418 | 355 |
Processed potatoes | 198 | 203 | 220 | 187 |
# Report the table's shape first, then the dtype of every column.
print(consumption.shape, consumption.dtypes, sep="\n")
(17, 4) England int64 Wales int64 Scotland int64 N Ireland int64 dtype: object
# Label-based slices include BOTH endpoints: take every 4th row from
# 'Cheese' through 'Cereals', and every column up to and including 'Wales'.
every_fourth_food = slice('Cheese', 'Cereals', 4)
up_to_wales = slice(None, 'Wales')
consumption.loc[every_fourth_food, up_to_wales]
England | Wales | |
---|---|---|
Cheese | 105 | 103 |
Fats and oils | 193 | 235 |
Other Veg | 488 | 570 |
Cereals | 1472 | 1582 |
# Draw one subplot per column (subplots=True), each plotted against the row index.
# The trailing ';' keeps the notebook from echoing the call's return value.
consumption.plot(subplots=True, figsize=(14, 8)); # Columns vs index
import pandas as pd
import numpy as np
# Load the Iris measurements, using the file's "Id" column as the row index,
# then give the five remaining columns short names.
iris = pd.read_csv('data/Iris.csv', sep=',', index_col="Id")
iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"]
# Relabel the rows "lab 0", "lab 1", ... in file order.
iris.index = [f"lab {i}" for i in range(len(iris))]

# generate iris_plus: iris extended with two synthetic columns.
# A locally seeded generator makes the random columns (and every table or
# plot derived from them) identical on each run, without touching NumPy's
# global random state.
rng = np.random.default_rng(2021)
countries = ["France", "Italy", "Spain", "China", "US", "Japan"]
probabilities = [0.35, 0.2, 0.05, 0.1, 0.1, 0.2]  # one weight per country
extension = pd.DataFrame({
    'Age': rng.integers(0, 10, size=len(iris)),  # integers drawn from [0, 10)
    'Country': rng.choice(countries, size=len(iris), replace=True, p=probabilities),
}, index=iris.index)
# Column-wise concatenation; both frames share the same "lab i" index.
iris_plus = pd.concat((iris, extension), axis=1)
iris_plus.head()
SepalLength | SepalWidth | PetalLength | PetalWidth | Species | Age | Country | |
---|---|---|---|---|---|---|---|
lab 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa | 8 | Spain |
lab 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa | 8 | US |
lab 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa | 1 | France |
lab 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa | 3 | France |
lab 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa | 5 | Italy |
# Count the rows per country, then show the counts as horizontal bars.
country_counts = iris_plus["Country"].value_counts()
country_counts.plot.barh(title="Number of observations given a country");
import matplotlib.pyplot as plt
# One pie chart per categorical variable, laid out side by side.
fig, ax = plt.subplots(1, 2, figsize=(16, 5))
for i, var in enumerate(["Age", "Country"]):
    # value_counts() gives the number of rows per distinct value; the pie
    # shows each value's share of the dataset on the i-th axis.
    iris_plus[var].value_counts().plot.pie(ax=ax[i])
    ax[i].set_title(f"numerical proportion of {var} in iris_plus dataset")
# compact way with groupby: per-species mean of the two sepal measurements
iris_plus.groupby("Species")[["SepalLength", "SepalWidth"]].mean()
SepalLength | SepalWidth | |
---|---|---|
Species | ||
Iris-setosa | 5.006 | 3.418 |
Iris-versicolor | 5.936 | 2.770 |
Iris-virginica | 6.588 | 2.974 |
# alternatively: select the rows of each species by hand and average the
# two sepal columns
for sp in ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]:
    print("\nmean for " + sp)
    is_species = iris_plus["Species"] == sp  # boolean row mask
    print(iris_plus.loc[is_species, ["SepalLength", "SepalWidth"]].mean(axis=0))
mean for Iris-setosa SepalLength 5.006 SepalWidth 3.418 dtype: float64 mean for Iris-versicolor SepalLength 5.936 SepalWidth 2.770 dtype: float64 mean for Iris-virginica SepalLength 6.588 SepalWidth 2.974 dtype: float64