{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"cell_id": "2d1b623617224d72896ab43e2d7a7d5a",
"deepnote_cell_type": "markdown"
},
"source": [
"# Pandas dataframe\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"\n",
"Download the [penguins dataset](https://github.com/mwaskom/seaborn-data/blob/master/penguins.csv) from the internet.\n",
"\n",
"If pandas is not installed, you can install it using the following command:\n",
"\n",
"`%conda install pandas`\n",
"\n",
"or \n",
"\n",
"`%pip install pandas`"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3800.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3250.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
" 36.7 | \n",
" 19.3 | \n",
" 193.0 | \n",
" 3450.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"2 Adelie Torgersen 40.3 18.0 195.0 \n",
"3 Adelie Torgersen NaN NaN NaN \n",
"4 Adelie Torgersen 36.7 19.3 193.0 \n",
"\n",
" body_mass_g sex \n",
"0 3750.0 MALE \n",
"1 3800.0 FEMALE \n",
"2 3250.0 FEMALE \n",
"3 NaN NaN \n",
"4 3450.0 FEMALE "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('penguins.csv')\n",
"df.head()\n",
"# you need to put the penguins.csv file in the same directory as this notebook\n",
"# otherwise you need to specify the path to the file e.g. pd.read_csv('/path/to/penguins.csv')\n",
"# You can get the current working directory of the notebook by running the following command\n",
"# !pwd\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DataFrames\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.frame.DataFrame"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"species object\n",
"island object\n",
"bill_length_mm float64\n",
"bill_depth_mm float64\n",
"flipper_length_mm float64\n",
"body_mass_g float64\n",
"sex object\n",
"dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the type of each column\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(344, 7)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the number of rows and columns\n",
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Indexing and slicing\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 Adelie\n",
"1 Adelie\n",
"2 Adelie\n",
"3 Adelie\n",
"4 Adelie\n",
" ... \n",
"339 Gentoo\n",
"340 Gentoo\n",
"341 Gentoo\n",
"342 Gentoo\n",
"343 Gentoo\n",
"Name: species, Length: 344, dtype: object\n"
]
},
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get column\n",
"col = df['species']\n",
"print(col)\n",
"type(col)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# althernatively\n",
"col = df.species"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"43.92192982456142"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.bill_length_mm.mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3800.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"\n",
" body_mass_g sex \n",
"0 3750.0 MALE \n",
"1 3800.0 FEMALE "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the first two rows \n",
"df[0:2]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3800.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3250.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
" 36.7 | \n",
" 19.3 | \n",
" 193.0 | \n",
" 3450.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 147 | \n",
" Adelie | \n",
" Dream | \n",
" 36.6 | \n",
" 18.4 | \n",
" 184.0 | \n",
" 3475.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 148 | \n",
" Adelie | \n",
" Dream | \n",
" 36.0 | \n",
" 17.8 | \n",
" 195.0 | \n",
" 3450.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 149 | \n",
" Adelie | \n",
" Dream | \n",
" 37.8 | \n",
" 18.1 | \n",
" 193.0 | \n",
" 3750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 150 | \n",
" Adelie | \n",
" Dream | \n",
" 36.0 | \n",
" 17.1 | \n",
" 187.0 | \n",
" 3700.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 151 | \n",
" Adelie | \n",
" Dream | \n",
" 41.5 | \n",
" 18.5 | \n",
" 201.0 | \n",
" 4000.0 | \n",
" MALE | \n",
"
\n",
" \n",
"
\n",
"
152 rows × 7 columns
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"2 Adelie Torgersen 40.3 18.0 195.0 \n",
"3 Adelie Torgersen NaN NaN NaN \n",
"4 Adelie Torgersen 36.7 19.3 193.0 \n",
".. ... ... ... ... ... \n",
"147 Adelie Dream 36.6 18.4 184.0 \n",
"148 Adelie Dream 36.0 17.8 195.0 \n",
"149 Adelie Dream 37.8 18.1 193.0 \n",
"150 Adelie Dream 36.0 17.1 187.0 \n",
"151 Adelie Dream 41.5 18.5 201.0 \n",
"\n",
" body_mass_g sex \n",
"0 3750.0 MALE \n",
"1 3800.0 FEMALE \n",
"2 3250.0 FEMALE \n",
"3 NaN NaN \n",
"4 3450.0 FEMALE \n",
".. ... ... \n",
"147 3475.0 FEMALE \n",
"148 3450.0 FEMALE \n",
"149 3750.0 MALE \n",
"150 3700.0 FEMALE \n",
"151 4000.0 MALE \n",
"\n",
"[152 rows x 7 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# use boolean indexing to filter the data\n",
"df[df['species'] == 'Adelie']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island\n",
"0 Adelie Torgersen\n",
"1 Adelie Torgersen\n",
"2 Adelie Torgersen"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# .loc is primarily label based, in this example, 0, 1, 2 are the labels of the rows.\n",
"# In some other cases, the row label may be a string such as 'a', 'b', 'c', etc.\n",
"# May also be used with a boolean array.\n",
"\n",
"# for loc, end index is included\n",
"df.loc[0:2, ['species', 'island']]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 147 | \n",
" Adelie | \n",
" Dream | \n",
"
\n",
" \n",
" 148 | \n",
" Adelie | \n",
" Dream | \n",
"
\n",
" \n",
" 149 | \n",
" Adelie | \n",
" Dream | \n",
"
\n",
" \n",
" 150 | \n",
" Adelie | \n",
" Dream | \n",
"
\n",
" \n",
" 151 | \n",
" Adelie | \n",
" Dream | \n",
"
\n",
" \n",
"
\n",
"
152 rows × 2 columns
\n",
"
"
],
"text/plain": [
" species island\n",
"0 Adelie Torgersen\n",
"1 Adelie Torgersen\n",
"2 Adelie Torgersen\n",
"3 Adelie Torgersen\n",
"4 Adelie Torgersen\n",
".. ... ...\n",
"147 Adelie Dream\n",
"148 Adelie Dream\n",
"149 Adelie Dream\n",
"150 Adelie Dream\n",
"151 Adelie Dream\n",
"\n",
"[152 rows x 2 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[df.species == 'Adelie', ['species', 'island']]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3750.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3800.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3250.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bill_length_mm bill_depth_mm flipper_length_mm body_mass_g\n",
"0 39.1 18.7 181.0 3750.0\n",
"1 39.5 17.4 186.0 3800.0\n",
"2 40.3 18.0 195.0 3250.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get a subset of the data frame\n",
"df.loc[:2, 'bill_length_mm':'body_mass_g']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island\n",
"0 Adelie Torgersen\n",
"1 Adelie Torgersen"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# .iloc is primarily integer position based (from 0 to length-1 of the axis)\n",
"# may also be used with a boolean array.\n",
"df.iloc[0:2, 0:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Descriptive statistics"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 342.000000 | \n",
" 342.000000 | \n",
" 342.000000 | \n",
" 342.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 43.921930 | \n",
" 17.151170 | \n",
" 200.915205 | \n",
" 4201.754386 | \n",
"
\n",
" \n",
" std | \n",
" 5.459584 | \n",
" 1.974793 | \n",
" 14.061714 | \n",
" 801.954536 | \n",
"
\n",
" \n",
" min | \n",
" 32.100000 | \n",
" 13.100000 | \n",
" 172.000000 | \n",
" 2700.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 39.225000 | \n",
" 15.600000 | \n",
" 190.000000 | \n",
" 3550.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 44.450000 | \n",
" 17.300000 | \n",
" 197.000000 | \n",
" 4050.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 48.500000 | \n",
" 18.700000 | \n",
" 213.000000 | \n",
" 4750.000000 | \n",
"
\n",
" \n",
" max | \n",
" 59.600000 | \n",
" 21.500000 | \n",
" 231.000000 | \n",
" 6300.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bill_length_mm bill_depth_mm flipper_length_mm body_mass_g\n",
"count 342.000000 342.000000 342.000000 342.000000\n",
"mean 43.921930 17.151170 200.915205 4201.754386\n",
"std 5.459584 1.974793 14.061714 801.954536\n",
"min 32.100000 13.100000 172.000000 2700.000000\n",
"25% 39.225000 15.600000 190.000000 3550.000000\n",
"50% 44.450000 17.300000 197.000000 4050.000000\n",
"75% 48.500000 18.700000 213.000000 4750.000000\n",
"max 59.600000 21.500000 231.000000 6300.000000"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 344 entries, 0 to 343\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 species 344 non-null object \n",
" 1 island 344 non-null object \n",
" 2 bill_length_mm 342 non-null float64\n",
" 3 bill_depth_mm 342 non-null float64\n",
" 4 flipper_length_mm 342 non-null float64\n",
" 5 body_mass_g 342 non-null float64\n",
" 6 sex 333 non-null object \n",
"dtypes: float64(4), object(3)\n",
"memory usage: 18.9+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Adelie 152\n",
"Gentoo 124\n",
"Chinstrap 68\n",
"Name: species, dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"species\"].value_counts(dropna=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3700.662251655629"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute the mean of one species\n",
"\n",
"df[df['species'] == 'Adelie']['body_mass_g'].mean()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
"
\n",
" \n",
" species | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Adelie | \n",
" 38.791391 | \n",
" 18.346358 | \n",
" 189.953642 | \n",
" 3700.662252 | \n",
"
\n",
" \n",
" Chinstrap | \n",
" 48.833824 | \n",
" 18.420588 | \n",
" 195.823529 | \n",
" 3733.088235 | \n",
"
\n",
" \n",
" Gentoo | \n",
" 47.504878 | \n",
" 14.982114 | \n",
" 217.186992 | \n",
" 5076.016260 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bill_length_mm bill_depth_mm flipper_length_mm body_mass_g\n",
"species \n",
"Adelie 38.791391 18.346358 189.953642 3700.662252\n",
"Chinstrap 48.833824 18.420588 195.823529 3733.088235\n",
"Gentoo 47.504878 14.982114 217.186992 5076.016260"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute the mean of all species\n",
"df.groupby(\"species\")[[\"bill_length_mm\", \"bill_depth_mm\", \"flipper_length_mm\", \"body_mass_g\"]].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Missing values"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 8 | \n",
" Adelie | \n",
" Torgersen | \n",
" 34.1 | \n",
" 18.1 | \n",
" 193.0 | \n",
" 3475.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 9 | \n",
" Adelie | \n",
" Torgersen | \n",
" 42.0 | \n",
" 20.2 | \n",
" 190.0 | \n",
" 4250.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 10 | \n",
" Adelie | \n",
" Torgersen | \n",
" 37.8 | \n",
" 17.1 | \n",
" 186.0 | \n",
" 3300.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 11 | \n",
" Adelie | \n",
" Torgersen | \n",
" 37.8 | \n",
" 17.3 | \n",
" 180.0 | \n",
" 3700.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 47 | \n",
" Adelie | \n",
" Dream | \n",
" 37.5 | \n",
" 18.9 | \n",
" 179.0 | \n",
" 2975.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 246 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 44.5 | \n",
" 14.3 | \n",
" 216.0 | \n",
" 4100.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 286 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 46.2 | \n",
" 14.4 | \n",
" 214.0 | \n",
" 4650.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 324 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 47.3 | \n",
" 13.8 | \n",
" 216.0 | \n",
" 4725.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 336 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 44.5 | \n",
" 15.7 | \n",
" 217.0 | \n",
" 4875.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 339 | \n",
" Gentoo | \n",
" Biscoe | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"3 Adelie Torgersen NaN NaN NaN \n",
"8 Adelie Torgersen 34.1 18.1 193.0 \n",
"9 Adelie Torgersen 42.0 20.2 190.0 \n",
"10 Adelie Torgersen 37.8 17.1 186.0 \n",
"11 Adelie Torgersen 37.8 17.3 180.0 \n",
"47 Adelie Dream 37.5 18.9 179.0 \n",
"246 Gentoo Biscoe 44.5 14.3 216.0 \n",
"286 Gentoo Biscoe 46.2 14.4 214.0 \n",
"324 Gentoo Biscoe 47.3 13.8 216.0 \n",
"336 Gentoo Biscoe 44.5 15.7 217.0 \n",
"339 Gentoo Biscoe NaN NaN NaN \n",
"\n",
" body_mass_g sex \n",
"3 NaN NaN \n",
"8 3475.0 NaN \n",
"9 4250.0 NaN \n",
"10 3300.0 NaN \n",
"11 3700.0 NaN \n",
"47 2975.0 NaN \n",
"246 4100.0 NaN \n",
"286 4650.0 NaN \n",
"324 4725.0 NaN \n",
"336 4875.0 NaN \n",
"339 NaN NaN "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show rows with missing values\n",
"df_missinig = df[df.isna().any(axis=1)]\n",
"df_missinig"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### How to handle missing values\n",
"\n",
"There is no one-size-fits-all solution for handling missing value.\n",
"If the dataset is large and only a few values are missing, it might be practical to simply drop these data points. \n",
"Alternatively, techniques such as imputation can be used to fill in the gaps. For instance, if temperature readings are missing in a time series, interpolating based on surrounding data points might be effective. \n",
"\n",
"Sometimes retaining missing values is beneficial, as their absence itself can provide meaningful insights into the data. For example, if a survey question is left blank, it might indicate that the respondent was uncomfortable answering it.\n",
"Certain algorithms are designed to interpret and learn from these missing patterns, adding an extra layer of analysis.\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_g | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3800.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3250.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
" 36.7 | \n",
" 19.3 | \n",
" 193.0 | \n",
" 3450.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 5 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.3 | \n",
" 20.6 | \n",
" 190.0 | \n",
" 3650.0 | \n",
" MALE | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 338 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 47.2 | \n",
" 13.7 | \n",
" 214.0 | \n",
" 4925.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 340 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 46.8 | \n",
" 14.3 | \n",
" 215.0 | \n",
" 4850.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 341 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 50.4 | \n",
" 15.7 | \n",
" 222.0 | \n",
" 5750.0 | \n",
" MALE | \n",
"
\n",
" \n",
" 342 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 45.2 | \n",
" 14.8 | \n",
" 212.0 | \n",
" 5200.0 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 343 | \n",
" Gentoo | \n",
" Biscoe | \n",
" 49.9 | \n",
" 16.1 | \n",
" 213.0 | \n",
" 5400.0 | \n",
" MALE | \n",
"
\n",
" \n",
"
\n",
"
342 rows × 7 columns
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"2 Adelie Torgersen 40.3 18.0 195.0 \n",
"4 Adelie Torgersen 36.7 19.3 193.0 \n",
"5 Adelie Torgersen 39.3 20.6 190.0 \n",
".. ... ... ... ... ... \n",
"338 Gentoo Biscoe 47.2 13.7 214.0 \n",
"340 Gentoo Biscoe 46.8 14.3 215.0 \n",
"341 Gentoo Biscoe 50.4 15.7 222.0 \n",
"342 Gentoo Biscoe 45.2 14.8 212.0 \n",
"343 Gentoo Biscoe 49.9 16.1 213.0 \n",
"\n",
" body_mass_g sex \n",
"0 3750.0 MALE \n",
"1 3800.0 FEMALE \n",
"2 3250.0 FEMALE \n",
"4 3450.0 FEMALE \n",
"5 3650.0 MALE \n",
".. ... ... \n",
"338 4925.0 FEMALE \n",
"340 4850.0 FEMALE \n",
"341 5750.0 MALE \n",
"342 5200.0 FEMALE \n",
"343 5400.0 MALE \n",
"\n",
"[342 rows x 7 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop rows with missing values\n",
"# you can use inplace=True to modify the original data frame\n",
"\n",
"# In our dataset, we can see that there are two samples with no numerical measurements\n",
"# There are 11 samples with only sex missing\n",
"\n",
"# we can drop all rows with missing values, becasue that's a small fraction of the data\n",
"\n",
"# maybe we can drop rows with missing numerical measurements and keep those with missing sex\n",
"# This can be achieved by thresh=6, which means that the row must have at least 6 non-missing values\n",
"\n",
"df_clean = df.dropna(thresh=6)\n",
"df_clean"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply\n",
"\n",
"The `apply` method is used to apply a function along the axis of a DataFrame or Series."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Convert the body_mass_g column to kg\n",
"df['body_mass_g'] = df['body_mass_g'].apply(lambda x: x/1000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_kg | \n",
" sex | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3.75 | \n",
" MALE | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3.80 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3.25 | \n",
" FEMALE | \n",
"
\n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
" 36.7 | \n",
" 19.3 | \n",
" 193.0 | \n",
" 3.45 | \n",
" FEMALE | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"2 Adelie Torgersen 40.3 18.0 195.0 \n",
"3 Adelie Torgersen NaN NaN NaN \n",
"4 Adelie Torgersen 36.7 19.3 193.0 \n",
"\n",
" body_mass_kg sex \n",
"0 3.75 MALE \n",
"1 3.80 FEMALE \n",
"2 3.25 FEMALE \n",
"3 NaN NaN \n",
"4 3.45 FEMALE "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rename the column\n",
"df.rename(columns={'body_mass_g': 'body_mass_kg'}, inplace=True)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" species | \n",
" island | \n",
" bill_length_mm | \n",
" bill_depth_mm | \n",
" flipper_length_mm | \n",
" body_mass_kg | \n",
" sex | \n",
" bill_ratio | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.1 | \n",
" 18.7 | \n",
" 181.0 | \n",
" 3.75 | \n",
" MALE | \n",
" 0.216022 | \n",
"
\n",
" \n",
" 1 | \n",
" Adelie | \n",
" Torgersen | \n",
" 39.5 | \n",
" 17.4 | \n",
" 186.0 | \n",
" 3.80 | \n",
" FEMALE | \n",
" 0.212366 | \n",
"
\n",
" \n",
" 2 | \n",
" Adelie | \n",
" Torgersen | \n",
" 40.3 | \n",
" 18.0 | \n",
" 195.0 | \n",
" 3.25 | \n",
" FEMALE | \n",
" 0.206667 | \n",
"
\n",
" \n",
" 3 | \n",
" Adelie | \n",
" Torgersen | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" Adelie | \n",
" Torgersen | \n",
" 36.7 | \n",
" 19.3 | \n",
" 193.0 | \n",
" 3.45 | \n",
" FEMALE | \n",
" 0.190155 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n",
"0 Adelie Torgersen 39.1 18.7 181.0 \n",
"1 Adelie Torgersen 39.5 17.4 186.0 \n",
"2 Adelie Torgersen 40.3 18.0 195.0 \n",
"3 Adelie Torgersen NaN NaN NaN \n",
"4 Adelie Torgersen 36.7 19.3 193.0 \n",
"\n",
" body_mass_kg sex bill_ratio \n",
"0 3.75 MALE 0.216022 \n",
"1 3.80 FEMALE 0.212366 \n",
"2 3.25 FEMALE 0.206667 \n",
"3 NaN NaN NaN \n",
"4 3.45 FEMALE 0.190155 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute the ratio of bill length to flipper length\n",
"df['bill_ratio'] = df.apply(lambda row: row['bill_length_mm'] / row['flipper_length_mm'], axis=1)\n",
"df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# View or Copy\n",
"\n",
"Use iloc or loc to modify the DataFrame.\n",
"\n",
"Be careful when slice a dataframe, and modify it. It is better to use the `copy()` method to avoid modifying the original DataFrame.\n",
"\n",
"For more detailed disucssion, see [here](https://www.practicaldatascience.org/html/views_and_copies_in_pandas.html)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the suggested way to modify the dataframe is to use .loc or .iloc\n",
"tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
"print(tmp)\n",
"tmp.loc[0, 'A'] = 10\n",
"print(tmp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example data frame\n",
"tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
"print(tmp)\n",
"# this is a view of the original dataframe\n",
"col_A = tmp['A']\n",
"# this modify the original dataframe\n",
"col_A[0] = 10\n",
"print(tmp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Similarly, changing tmp also changes col_A\n",
"tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
"# this is a view of the original dataframe\n",
"col_A = tmp['A']\n",
"# this modify the original dataframe\n",
"tmp['A'][0] = 10\n",
"print(col_A)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This gives a warning\n",
"tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
"tmp['A'][0] = 10\n",
"print(tmp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
"# this is a copy of the column\n",
"col_A = tmp['A'].copy()\n",
"# change the copy does not change the original dataframe\n",
"col_A[0] = 10\n",
"print(tmp)"
]
}
],
"metadata": {
"deepnote": {},
"deepnote_execution_queue": [],
"deepnote_notebook_id": "10c00f3043944b9ebe3ee417968fcfd0",
"deepnote_persisted_session": {
"createdAt": "2023-10-16T21:31:24.983Z"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}