# IPython shell escape: print the raw CSV file as-is (fields are ';'-separated,
# and the header row starts with an empty field for the label column).
! cat data/defra_consumption.csv

;England;Wales;Scotland;N Ireland
Cheese;105;103;103;66
Carcass meat;245;227;242;267
Other meat;685;803;750;586
Fish;147;160;122;93
Fats and oils;193;235;184;209
Sugars;156;175;147;139
Fresh potatoes;720;874;566;1033
Fresh Veg;253;265;171;143
Other Veg;488;570;418;355
Processed potatoes;198;203;220;187
Processed Veg;360;365;337;334
Fresh fruit;1102;1137;957;674
Cereals;1472;1582;1462;1494
Beverages;57;73;53;47
Soft drinks;1374;1256;1572;1506
Alcoholic drinks;375;475;458;135
Confectionery;54;64;62;41


import pandas as pd

# Parse the ';'-delimited table; the first (unnamed) column holds the food
# labels and is used as the row index.
consumption = pd.read_csv(
    'data/defra_consumption.csv',
    index_col=0,
    sep=';',
)
# Preview the first ten rows.
consumption.head(n=10)


# Show the (rows, columns) tuple, then the dtype of each column, one per line.
print(consumption.shape, consumption.dtypes, sep="\n")

(17, 4)
England      int64
Wales        int64
Scotland     int64
N Ireland    int64
dtype: object


# Label-based selection: every 4th row from 'Cheese' through 'Cereals',
# and every column up to and including 'Wales'.
consumption.loc[slice('Cheese', 'Cereals', 4), slice(None, 'Wales')]


# One line chart per column, each drawn against the row index.
consumption.plot(kind="line", figsize=(14, 8), subplots=True);  # Columns vs index


import pandas as pd
import numpy as np

# Load the iris table, give the columns short names and relabel the rows
# "lab 0", "lab 1", ... in file order.
iris = pd.read_csv('data/Iris.csv', index_col="Id", sep=',')
iris.columns = [
    "SepalLength",
    "SepalWidth",
    "PetalLength",
    "PetalWidth",
    "Species",
]
iris.index = [f"lab {row}" for row in range(len(iris))]

# Build iris_plus: iris extended with two randomly generated columns.
countries = ["France", "Italy", "Spain", "China", "US", "Japan"]
probabilities = [0.35, 0.2, 0.05, 0.1, 0.1, 0.2]
extension = pd.DataFrame(
    {
        # Integer ages drawn from 0..9 (upper bound 10 is exclusive).
        'Age': np.random.randint(low=0, high=10, size=len(iris)),
        # Countries sampled with replacement using the weights above.
        'Country': np.random.choice(
            countries,
            size=len(iris),
            replace=True,
            p=probabilities,
        ),
    },
    index=iris.index,
)

# Glue the new columns to the right of the original ones (aligned on the index).
iris_plus = pd.concat([iris, extension], axis="columns")
iris_plus.head()


# Horizontal bar chart with one bar per country, sized by its row count.
(
    iris_plus["Country"]
    .value_counts()
    .plot(kind="barh", title="Number of observations given a country")
);


import matplotlib.pyplot as plt

# Two pie charts side by side: one for "Age", one for "Country".
fig, ax = plt.subplots(1, 2, figsize=(16, 5))

for axis, var in zip(ax, ["Age", "Country"]):
    # value_counts() gives one wedge per distinct value, sized by its frequency.
    # Series.plot.pie has no `x` parameter, so only the target axes is passed.
    iris_plus[var].value_counts().plot.pie(ax=axis)
    axis.set_title(f"numerical proportion of {var} in iris_plus dataset")


# Per-species mean of the two sepal measurements, in one grouped aggregation.
iris_plus.groupby("Species")[["SepalLength", "SepalWidth"]].mean()


# Same per-species means, obtained by filtering the rows of one species at a
# time with a boolean mask.
for species_name in ("Iris-setosa", "Iris-versicolor", "Iris-virginica"):
    print(f"\nmean for {species_name}")
    is_species = iris_plus["Species"] == species_name
    sepal_columns = iris_plus.loc[is_species, ["SepalLength", "SepalWidth"]]
    print(sepal_columns.mean(axis=0))

mean for Iris-setosa
SepalLength    5.006
SepalWidth     3.418
dtype: float64

mean for Iris-versicolor
SepalLength    5.936
SepalWidth     2.770
dtype: float64

mean for Iris-virginica
SepalLength    6.588
SepalWidth     2.974
dtype: float64

Lecture 5: Data manipulation in Python (Solutions)

Exercise 1:

Exercise 2:

Exercise 3:

Exercise 4:

	England	Wales	Scotland	N Ireland
Cheese	105	103	103	66
Carcass meat	245	227	242	267
Other meat	685	803	750	586
Fish	147	160	122	93
Fats and oils	193	235	184	209
Sugars	156	175	147	139
Fresh potatoes	720	874	566	1033
Fresh Veg	253	265	171	143
Other Veg	488	570	418	355
Processed potatoes	198	203	220	187

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species	Age	Country
lab 0	5.1	3.5	1.4	0.2	Iris-setosa	8	Spain
lab 1	4.9	3.0	1.4	0.2	Iris-setosa	8	US
lab 2	4.7	3.2	1.3	0.2	Iris-setosa	1	France
lab 3	4.6	3.1	1.5	0.2	Iris-setosa	3	France
lab 4	5.0	3.6	1.4	0.2	Iris-setosa	5	Italy

	SepalLength	SepalWidth
Species
Iris-setosa	5.006	3.418
Iris-versicolor	5.936	2.770
Iris-virginica	6.588	2.974