# Writing to a plain-text (.txt) file in Python

# Mode "w" opens "file_name.txt" for writing and binds it to `out_file`;
# the file is closed automatically when the `with` block ends.
with open("file_name.txt", "w") as out_file:
    for fragment in (
        "writing whatever I want in this file...",
        "and adding another information. ",
        "Let us skip two lines: \n\n",
        "Let us add tabulates: \t\t",
        "End.\n",
    ):
        # write() adds no separator or newline of its own
        out_file.write(fragment)


# Mode "a" (append) adds to the end of the same file and keeps what is
# already in it.
with open("file_name.txt", "a") as out_file:
    out_file.write("\nAdding an information without erasing the previous informations")


# Reading a plain-text (.txt) file in Python

with open("file_name.txt", "r") as in_file:
    for line in in_file:
        # each line still ends with its own "\n" and print() adds another one
        print(line)

writing whatever I want in this file...and adding another information. Let us skip two lines: 


Let us add tabulates: 		End.


Adding an information without erasing the previous informations


# Read the whole CSV file into a list holding one raw string per line
# (each string keeps its trailing "\n").
# NOTE(review): the path here is spelled "data/iris.csv" while the pandas
# example further down opens 'data/Iris.csv'. On a case-sensitive file system
# only one spelling can match the real file -- confirm which one is correct.
with open("data/iris.csv", "r") as f:
    table = f.readlines()
    # Equivalent, iterating over the file line by line:
    #     table = [line for line in f]

table[:5]  # the header line followed by the first 4 data rows

['Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species\n',
 '1,5.1,3.5,1.4,0.2,Iris-setosa\n',
 '2,4.9,3.0,1.4,0.2,Iris-setosa\n',
 '3,4.7,3.2,1.3,0.2,Iris-setosa\n',
 '4,4.6,3.1,1.5,0.2,Iris-setosa\n']


import pandas as pd


# Load the iris table with pandas. The arguments are: the path of the file,
# the character separating the values, and the column whose values become
# the index of the observations (rows).
iris = pd.read_csv(
    "data/Iris.csv",
    sep=",",
    index_col="Id",
)
iris


import numpy as np

# df1: a 6 x 4 table of random numbers with explicit row and column labels.
# Without `index` the rows would be labelled 0, 1, .. 5; without `columns`
# the columns would be labelled 0, 1, .. 3.
df1 = pd.DataFrame(
    np.random.rand(6, 4),
    index=list("uvwxyz"),
    columns=list("abcd"),
)
df1


# df2: built from a dict mapping each column name to its content.
# A single item ('a', 'b') is repeated on every row; 'c' and 'd' supply one
# item per row and therefore fix the number of rows (5).
df2_content = {
    "a": 1.0,
    "b": "same",
    "c": np.random.rand(5),
    "d": pd.Categorical(["test", "train", "train", "train", "test"]),
}
df2 = pd.DataFrame(df2_content)
df2


# Add a new column `e` to df1: one random integer (0 or 1) per row.
df1["e"] = np.random.randint(0, 2, size=len(df1))
df1


iris.head(n=3) # first 3 rows (observations) of the table


iris.tail(n=3) # last 3 rows (observations) of the table


iris.values[:3]  # the data as a NumPy array, without the index labels (only the first 3 rows are shown)

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa']], dtype=object)


# iris data: print each structural attribute, followed by an empty line

for iris_attribute in (
    iris.index,    # index (row labels) of the table
    iris.columns,  # columns of the table
    iris.shape,    # shape of the table
    iris.dtypes,   # data type of each variable
):
    print(iris_attribute, end="\n\n")

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            141, 142, 143, 144, 145, 146, 147, 148, 149, 150],
           dtype='int64', name='Id', length=150)

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

(150, 5)

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


# df1 data: print each structural attribute, followed by an empty line

for df1_attribute in (
    df1.index,    # index (row labels) of the table
    df1.columns,  # columns of the table
    df1.shape,    # shape of the table
    df1.dtypes,   # data type of each variable
):
    print(df1_attribute, end="\n\n")

Index(['u', 'v', 'w', 'x', 'y', 'z'], dtype='object')

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

(6, 5)

a    float64
b    float64
c    float64
d    float64
e      int64
dtype: object


iris.info()  # prints a summary: index range, columns, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


iris.describe()  # count, mean, std, min, quartiles and max of the numeric columns


df1.sort_index(ascending=False)  # rows ordered by index label, from 'z' down to 'u'


df1.sort_values(by='a')  # rows ordered by increasing value of column 'a'


iris.head()


# Replace all the column labels at once: the list gives one new name per
# existing column, in the same order (here the "Cm" suffix is dropped).
print(iris.columns)
iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"]
print(iris.columns)

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species'], dtype='object')


iris.head(3)


# You can reset the index with:

iris.reset_index(inplace=True) # "Id" is now an ordinary column of the dataframe
iris.head(3)


# You can turn the "Id" column back into the dataframe index:

iris.set_index("Id", drop=True, inplace=True)
iris.head(3)


# Let us rename the index labels to "lab 0", "lab 1", ...:
# NOTE(review): the original remark suggested `reindex` as an alternative, but
# `reindex` aligns the rows to a given set of labels rather than renaming the
# existing ones -- confirm before relying on it for this purpose.

iris.index = ["lab " + str(i) for i in range(len(iris))]
iris.head()

## A more compact variant that builds the labels from the current index.
## NOTE(review): if used instead of the assignment above, the labels would be
## derived from the Id values (1..150), i.e. "lab 1" .. "lab 150".
#iris.index = "lab " + iris.index.map(str)
#iris.head()


iris['SepalWidth'] # iris is a "DataFrame"; selecting one column by name gives a "Series"

lab 0      3.5
lab 1      3.0
lab 2      3.2
lab 3      3.1
lab 4      3.6
          ... 
lab 145    3.0
lab 146    2.5
lab 147    3.0
lab 148    3.4
lab 149    3.0
Name: SepalWidth, Length: 150, dtype: float64


s = iris['SepalWidth']  # iris is a "DataFrame"; this single column is a "Series"
# The Series is labelled "lab 0", "lab 1", ..., so the third element is
# selected by position explicitly with `.iloc`. The plain `s[2]` form relies on
# pandas treating an integer key as a position when the index is not
# integer-based, a fallback that pandas has deprecated.
s.iloc[2]

3.2


iris[['SepalWidth', 'PetalLength']]  # a list of column names selects several columns at once


iris[['SepalWidth']]  # a one-name list: pandas returns a one-column DataFrame here, not a Series


iris.loc["lab 3"]  # Single row, selected by its index label

SepalLength            4.6
SepalWidth             3.1
PetalLength            1.5
PetalWidth             0.2
Species        Iris-setosa
Name: lab 3, dtype: object


iris.loc[:, 'SepalWidth'].head()  # Single column: every row (`:`) of 'SepalWidth'; head() shows the first 5

lab 0    3.5
lab 1    3.0
lab 2    3.2
lab 3    3.1
lab 4    3.6
Name: SepalWidth, dtype: float64


iris.loc[['lab 3', 'lab 16']]  # Multiple rows, given as a list of labels


iris.loc['lab 3':'lab 16']  # Row slicing by label (with .loc the end label is included)


iris.loc['lab 3':'lab 16':3]  # Row slicing (with step 3)


iris.loc['lab 3':'lab 16':3, :'PetalWidth']  # Row and column slicing


iris.iloc[:2]  # first 2 rows, selected by position


iris.iloc[10:20:2, ::2]  # every 2nd row among positions 10..19, and every 2nd column


iris.loc[iris['PetalWidth'] > 2.3]  # Row selection with a boolean condition


iris.loc[iris['PetalWidth'] > 2.3, ["PetalWidth", "Species"]]  # Row and column selection


# we select only the rows whose species is "Iris-setosa" OR "Iris-virginica"

iris.loc[ iris['Species'].isin(['Iris-setosa', 'Iris-virginica']) ]


iris.sample(n=5) # 5 rows drawn at random


# Work on a copy of the first 10 rows of iris.
df = iris.head(10).copy()
df


# Add a column: one random number per row.
df["RandomNumbers"] = np.random.rand(10)
df


# Add a row: a copy of the current last row under a new label.
df.loc["duplicate last row"] = df.iloc[-1]
df


# Delete the column and the row that were just added.
df.drop(columns="RandomNumbers", inplace=True)
df.drop(index="duplicate last row", inplace=True)
df


# Three rows for a new (deliberately gigantic!) species: the four measurements
# of the rows "new 1", "new 2", "new 3" are all 10, 20 and 30 respectively.

df_row = pd.DataFrame(
    {
        **{
            measurement: [10, 20, 30]
            for measurement in ("SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
        },
        "Species": ["Iris-giant"] * 3,
    },
    index=[f"new {rank}" for rank in (1, 2, 3)],
)
df_row


# New information, "Age" and "Country", for the 10 rows of df
# (df_col reuses df's index so that its rows carry the same labels)

df_col = pd.DataFrame({
    'Age': np.random.randint(0, 10, 10),
    'Country': ["France", "Italy", "Spain", "China", "US", "France", "Spain", "Spain", "France", "Japan"]
}, index=df.index)
df_col


# Concatenate with axis=0 (the result is only displayed, not stored)

pd.concat((df, df_row), axis=0)


# Concatenate with axis=1 and assign the result to the variable name `data`

data = pd.concat((df, df_col), axis=1)
data


# Remove the "Species" column from df (in place)
df.drop(["Species"], axis=1, inplace=True)
df


df.median()  # Median of each (numeric) column
# alternatively: df.median(axis=0)

SepalLength    4.9
SepalWidth     3.3
PetalLength    1.4
PetalWidth     0.2
dtype: float64


df.median(axis=1)  # Median of each row, taken across its numeric values

lab 0    2.45
lab 1    2.20
lab 2    2.25
lab 3    2.30
lab 4    2.50
lab 5    2.80
lab 6    2.40
lab 7    2.45
lab 8    2.15
lab 9    2.30
dtype: float64


df['SepalLength'].value_counts()  # number of occurrences of each distinct SepalLength value

4.9    2
4.6    2
5.0    2
5.1    1
4.7    1
5.4    1
4.4    1
Name: SepalLength, dtype: int64


df.max(axis=0)  # maximum of each column

SepalLength    5.4
SepalWidth     3.9
PetalLength    1.7
PetalWidth     0.4
dtype: float64


iris.head()


# Build `iris_plus`: iris with two extra, randomly generated columns.

# Candidate countries and the probability of drawing each of them
# (both lists are in the same order).
countries = ["France", "Italy", "Spain", "China", "US", "Japan"]
probabilities = [0.35, 0.2, 0.05, 0.1, 0.1, 0.2]

n_observations = len(iris)
random_ages = np.random.randint(0, 10, n_observations)
random_countries = np.random.choice(
    countries, size=n_observations, replace=True, p=probabilities
)
# The extra columns reuse the index of iris.
extension = pd.DataFrame(
    {"Age": random_ages, "Country": random_countries},
    index=iris.index,
)

iris_plus = pd.concat((iris, extension), axis=1)


iris_plus.head()


# Line plot; the returned axes object is kept in `ax` so that a title can be set.
ax = iris.plot(figsize=(10, 6))  # Columns vs index
ax.set_title("Line plot w.r.t. species");  # the trailing ';' keeps the notebook from echoing the returned object


iris.plot(subplots=True, figsize=(10, 8));  # Columns vs index


# Histogram, with a title and an x-axis label set on the returned axes.
ax = iris.plot.hist(alpha=0.5, figsize=(10, 6))
ax.set_title("Histogram");
ax.set_xlabel("in cm")  # no trailing ';' here, so the notebook echoes the returned Text object

Text(0.5, 0, 'in cm')


iris.plot.hist(alpha=0.5, subplots=True, figsize=(10, 8));  # same histogram call, with subplots=True


iris.plot.scatter(x="SepalLength", y="PetalLength");  # scatter plot of PetalLength (y) against SepalLength (x)

	a	b	c	d
u	0.038331	0.768922	0.916012	0.042691
v	0.240109	0.470671	0.262923	0.659971
w	0.923666	0.379960	0.594586	0.666815
x	0.431028	0.239062	0.145889	0.226571
y	0.155564	0.236447	0.302043	0.707633
z	0.349677	0.966427	0.663219	0.040674

	a	b	c	d	e
u	0.038331	0.768922	0.916012	0.042691	0
v	0.240109	0.470671	0.262923	0.659971	1
w	0.923666	0.379960	0.594586	0.666815	0
x	0.431028	0.239062	0.145889	0.226571	0
y	0.155564	0.236447	0.302043	0.707633	0
z	0.349677	0.966427	0.663219	0.040674	1

	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.054000	3.758667	1.198667
std	0.828066	0.433594	1.764420	0.763161
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

	a	b	c	d	e
z	0.349677	0.966427	0.663219	0.040674	1
y	0.155564	0.236447	0.302043	0.707633	0
x	0.431028	0.239062	0.145889	0.226571	0
w	0.923666	0.379960	0.594586	0.666815	0
v	0.240109	0.470671	0.262923	0.659971	1
u	0.038331	0.768922	0.916012	0.042691	0

	a	b	c	d	e
u	0.038331	0.768922	0.916012	0.042691	0
y	0.155564	0.236447	0.302043	0.707633	0
v	0.240109	0.470671	0.262923	0.659971	1
z	0.349677	0.966427	0.663219	0.040674	1
x	0.431028	0.239062	0.145889	0.226571	0
w	0.923666	0.379960	0.594586	0.666815	0

François HU

About me, my research, my teaching and my experimentations

Lecture 5 : Data manipulation in Python¶

General introduction (a little long) ¶

Write and add mode¶

read mode¶

external packages¶

1. Introduction to Dataframes ¶

2. Data representation ¶

Reading a dataframe¶

Creating a dataframe¶

Viewing a dataframe¶

3. Data manipulation ¶

Updating row and column label¶

Natural indexing¶

Label based indexing and slicing: method `.loc[]`¶

position based indexing and slicing: method `.iloc[]`¶

Boolean indexing and slicing¶

Adding and deleting items¶

Concatenating two dataframes¶

4. Descriptive statistics ¶

5. Data visualization with Pandas ¶

Line plot¶

Histogram¶

Scatter plot¶

Exercises ¶

Exercise 1:¶

Exercise 2:¶

Exercise 3: Additional plots¶

Exercise 4:¶

	SepalLengthCm	SepalWidthCm	PetalLengthCm	PetalWidthCm	Species
Id
1	5.1	3.5	1.4	0.2	Iris-setosa
2	4.9	3.0	1.4	0.2	Iris-setosa
3	4.7	3.2	1.3	0.2	Iris-setosa
4	4.6	3.1	1.5	0.2	Iris-setosa
5	5.0	3.6	1.4	0.2	Iris-setosa
...	...	...	...	...	...
146	6.7	3.0	5.2	2.3	Iris-virginica
147	6.3	2.5	5.0	1.9	Iris-virginica
148	6.5	3.0	5.2	2.0	Iris-virginica
149	6.2	3.4	5.4	2.3	Iris-virginica
150	5.9	3.0	5.1	1.8	Iris-virginica

	a	b	c	d
0	1.0	same	0.498464	test
1	1.0	same	0.521439	train
2	1.0	same	0.409253	train
3	1.0	same	0.132740	train
4	1.0	same	0.418842	test

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
lab 0	5.1	3.5	1.4	0.2	Iris-setosa
lab 1	4.9	3.0	1.4	0.2	Iris-setosa
lab 2	4.7	3.2	1.3	0.2	Iris-setosa
lab 3	4.6	3.1	1.5	0.2	Iris-setosa
lab 4	5.0	3.6	1.4	0.2	Iris-setosa

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
lab 3	4.6	3.1	1.5	0.2	Iris-setosa
lab 6	4.6	3.4	1.4	0.3	Iris-setosa
lab 9	4.9	3.1	1.5	0.1	Iris-setosa
lab 12	4.8	3.0	1.4	0.1	Iris-setosa
lab 15	5.7	4.4	1.5	0.4	Iris-setosa

	SepalLength	PetalLength	Species
lab 10	5.4	1.5	Iris-setosa
lab 12	4.8	1.4	Iris-setosa
lab 14	5.8	1.2	Iris-setosa
lab 16	5.4	1.3	Iris-setosa
lab 18	5.7	1.7	Iris-setosa

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
lab 100	6.3	3.3	6.0	2.5	Iris-virginica
lab 109	7.2	3.6	6.1	2.5	Iris-virginica
lab 114	5.8	2.8	5.1	2.4	Iris-virginica
lab 136	6.3	3.4	5.6	2.4	Iris-virginica
lab 140	6.7	3.1	5.6	2.4	Iris-virginica
lab 144	6.7	3.3	5.7	2.5	Iris-virginica

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
lab 70	5.9	3.2	4.8	1.8	Iris-versicolor
lab 56	6.3	3.3	4.7	1.6	Iris-versicolor
lab 7	5.0	3.4	1.5	0.2	Iris-setosa
lab 53	5.5	2.3	4.0	1.3	Iris-versicolor
lab 22	4.6	3.6	1.0	0.2	Iris-setosa

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species	RandomNumbers
lab 0	5.1	3.5	1.4	0.2	Iris-setosa	0.975563
lab 1	4.9	3.0	1.4	0.2	Iris-setosa	0.898945
lab 2	4.7	3.2	1.3	0.2	Iris-setosa	0.659174
lab 3	4.6	3.1	1.5	0.2	Iris-setosa	0.549566
lab 4	5.0	3.6	1.4	0.2	Iris-setosa	0.220262
lab 5	5.4	3.9	1.7	0.4	Iris-setosa	0.200619
lab 6	4.6	3.4	1.4	0.3	Iris-setosa	0.046385
lab 7	5.0	3.4	1.5	0.2	Iris-setosa	0.441242
lab 8	4.4	2.9	1.4	0.2	Iris-setosa	0.573576
lab 9	4.9	3.1	1.5	0.1	Iris-setosa	0.449876

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
new 1	10	10	10	10	Iris-giant
new 2	20	20	20	20	Iris-giant
new 3	30	30	30	30	Iris-giant

	Age	Country
lab 0	8	France
lab 1	9	Italy
lab 2	4	Spain
lab 3	2	China
lab 4	0	US
lab 5	4	France
lab 6	6	Spain
lab 7	9	Spain
lab 8	5	France
lab 9	1	Japan

Lecture 5 : Data manipulation in Python¶

General introduction (a little long) ¶

Write and add mode¶

read mode¶

external packages¶

1. Introduction to Dataframes ¶

2. Data representation ¶

Reading a dataframe¶

Creating a dataframe¶

Viewing a dataframe¶

3. Data manipulation ¶

Updating row and column label¶

Natural indexing¶

Label based indexing and slicing: method .loc[]¶

position based indexing and slicing: method .iloc[]¶

Boolean indexing and slicing¶

Adding and deleting items¶

Concatenating two dataframes¶

4. Descriptive statistics ¶

5. Data visualization with Pandas ¶

Line plot¶

Histogram¶

Scatter plot¶

Exercises ¶

Exercise 1:¶

Exercise 2:¶

Exercise 3: Additional plots¶

Exercise 4:¶

Label based indexing and slicing: method `.loc[]`¶

position based indexing and slicing: method `.iloc[]`¶