{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": "2d1b623617224d72896ab43e2d7a7d5a",
    "deepnote_cell_type": "markdown"
   },
   "source": [
    "# Pandas dataframe\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "\n",
    "Download the [penguins dataset](https://github.com/mwaskom/seaborn-data/blob/master/penguins.csv) from the internet.\n",
    "\n",
    "If pandas is not installed, you can install it using the following command:\n",
    "\n",
    "`%conda install pandas`\n",
    "\n",
    "or \n",
    "\n",
    "`%pip install pandas`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3250.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0  Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1  Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2  Adelie  Torgersen            40.3           18.0              195.0   \n",
       "3  Adelie  Torgersen             NaN            NaN                NaN   \n",
       "4  Adelie  Torgersen            36.7           19.3              193.0   \n",
       "\n",
       "   body_mass_g     sex  \n",
       "0       3750.0    MALE  \n",
       "1       3800.0  FEMALE  \n",
       "2       3250.0  FEMALE  \n",
       "3          NaN     NaN  \n",
       "4       3450.0  FEMALE  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv('penguins.csv')\n",
    "df.head()\n",
    "# you need to put the penguins.csv file in the same directory as this notebook\n",
    "# otherwise you need to specify the path to the file e.g. pd.read_csv('/path/to/penguins.csv')\n",
    "# You can get the current working directory of the notebook by running the following command\n",
    "# !pwd\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DataFrames\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.frame.DataFrame"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "species               object\n",
       "island                object\n",
       "bill_length_mm       float64\n",
       "bill_depth_mm        float64\n",
       "flipper_length_mm    float64\n",
       "body_mass_g          float64\n",
       "sex                   object\n",
       "dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the type of each column\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(344, 7)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the number of rows and columns\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Indexing and slicing\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0      Adelie\n",
      "1      Adelie\n",
      "2      Adelie\n",
      "3      Adelie\n",
      "4      Adelie\n",
      "        ...  \n",
      "339    Gentoo\n",
      "340    Gentoo\n",
      "341    Gentoo\n",
      "342    Gentoo\n",
      "343    Gentoo\n",
      "Name: species, Length: 344, dtype: object\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get column\n",
    "col = df['species']\n",
    "print(col)\n",
    "type(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# althernatively\n",
    "col = df.species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43.92192982456142"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.bill_length_mm.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0  Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1  Adelie  Torgersen            39.5           17.4              186.0   \n",
       "\n",
       "   body_mass_g     sex  \n",
       "0       3750.0    MALE  \n",
       "1       3800.0  FEMALE  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the first two rows \n",
    "df[0:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3250.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>36.6</td>\n",
       "      <td>18.4</td>\n",
       "      <td>184.0</td>\n",
       "      <td>3475.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>36.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>37.8</td>\n",
       "      <td>18.1</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>36.0</td>\n",
       "      <td>17.1</td>\n",
       "      <td>187.0</td>\n",
       "      <td>3700.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>41.5</td>\n",
       "      <td>18.5</td>\n",
       "      <td>201.0</td>\n",
       "      <td>4000.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>152 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0    Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1    Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2    Adelie  Torgersen            40.3           18.0              195.0   \n",
       "3    Adelie  Torgersen             NaN            NaN                NaN   \n",
       "4    Adelie  Torgersen            36.7           19.3              193.0   \n",
       "..      ...        ...             ...            ...                ...   \n",
       "147  Adelie      Dream            36.6           18.4              184.0   \n",
       "148  Adelie      Dream            36.0           17.8              195.0   \n",
       "149  Adelie      Dream            37.8           18.1              193.0   \n",
       "150  Adelie      Dream            36.0           17.1              187.0   \n",
       "151  Adelie      Dream            41.5           18.5              201.0   \n",
       "\n",
       "     body_mass_g     sex  \n",
       "0         3750.0    MALE  \n",
       "1         3800.0  FEMALE  \n",
       "2         3250.0  FEMALE  \n",
       "3            NaN     NaN  \n",
       "4         3450.0  FEMALE  \n",
       "..           ...     ...  \n",
       "147       3475.0  FEMALE  \n",
       "148       3450.0  FEMALE  \n",
       "149       3750.0    MALE  \n",
       "150       3700.0  FEMALE  \n",
       "151       4000.0    MALE  \n",
       "\n",
       "[152 rows x 7 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use boolean indexing to filter the data\n",
    "df[df['species'] == 'Adelie']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island\n",
       "0  Adelie  Torgersen\n",
       "1  Adelie  Torgersen\n",
       "2  Adelie  Torgersen"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .loc is primarily label based, in this example, 0, 1, 2 are the labels of the rows.\n",
    "# In some other cases, the row label may be a string such as 'a', 'b', 'c', etc.\n",
    "# May also be used with a boolean array.\n",
    "\n",
    "# for loc, end index is included\n",
    "df.loc[0:2, ['species', 'island']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>152 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    species     island\n",
       "0    Adelie  Torgersen\n",
       "1    Adelie  Torgersen\n",
       "2    Adelie  Torgersen\n",
       "3    Adelie  Torgersen\n",
       "4    Adelie  Torgersen\n",
       "..      ...        ...\n",
       "147  Adelie      Dream\n",
       "148  Adelie      Dream\n",
       "149  Adelie      Dream\n",
       "150  Adelie      Dream\n",
       "151  Adelie      Dream\n",
       "\n",
       "[152 rows x 2 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[df.species == 'Adelie', ['species', 'island']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3250.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g\n",
       "0            39.1           18.7              181.0       3750.0\n",
       "1            39.5           17.4              186.0       3800.0\n",
       "2            40.3           18.0              195.0       3250.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get a subset of the data frame\n",
    "df.loc[:2, 'bill_length_mm':'body_mass_g']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island\n",
       "0  Adelie  Torgersen\n",
       "1  Adelie  Torgersen"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .iloc is primarily integer position based (from 0 to length-1 of the axis)\n",
    "#  may also be used with a boolean array.\n",
    "df.iloc[0:2, 0:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Descriptive statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>342.000000</td>\n",
       "      <td>342.000000</td>\n",
       "      <td>342.000000</td>\n",
       "      <td>342.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>43.921930</td>\n",
       "      <td>17.151170</td>\n",
       "      <td>200.915205</td>\n",
       "      <td>4201.754386</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>5.459584</td>\n",
       "      <td>1.974793</td>\n",
       "      <td>14.061714</td>\n",
       "      <td>801.954536</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>32.100000</td>\n",
       "      <td>13.100000</td>\n",
       "      <td>172.000000</td>\n",
       "      <td>2700.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>39.225000</td>\n",
       "      <td>15.600000</td>\n",
       "      <td>190.000000</td>\n",
       "      <td>3550.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>44.450000</td>\n",
       "      <td>17.300000</td>\n",
       "      <td>197.000000</td>\n",
       "      <td>4050.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>48.500000</td>\n",
       "      <td>18.700000</td>\n",
       "      <td>213.000000</td>\n",
       "      <td>4750.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>59.600000</td>\n",
       "      <td>21.500000</td>\n",
       "      <td>231.000000</td>\n",
       "      <td>6300.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g\n",
       "count      342.000000     342.000000         342.000000   342.000000\n",
       "mean        43.921930      17.151170         200.915205  4201.754386\n",
       "std          5.459584       1.974793          14.061714   801.954536\n",
       "min         32.100000      13.100000         172.000000  2700.000000\n",
       "25%         39.225000      15.600000         190.000000  3550.000000\n",
       "50%         44.450000      17.300000         197.000000  4050.000000\n",
       "75%         48.500000      18.700000         213.000000  4750.000000\n",
       "max         59.600000      21.500000         231.000000  6300.000000"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 344 entries, 0 to 343\n",
      "Data columns (total 7 columns):\n",
      " #   Column             Non-Null Count  Dtype  \n",
      "---  ------             --------------  -----  \n",
      " 0   species            344 non-null    object \n",
      " 1   island             344 non-null    object \n",
      " 2   bill_length_mm     342 non-null    float64\n",
      " 3   bill_depth_mm      342 non-null    float64\n",
      " 4   flipper_length_mm  342 non-null    float64\n",
      " 5   body_mass_g        342 non-null    float64\n",
      " 6   sex                333 non-null    object \n",
      "dtypes: float64(4), object(3)\n",
      "memory usage: 18.9+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Adelie       152\n",
       "Gentoo       124\n",
       "Chinstrap     68\n",
       "Name: species, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"species\"].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3700.662251655629"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compute the mean of one species\n",
    "\n",
    "df[df['species'] == 'Adelie']['body_mass_g'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>species</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Adelie</th>\n",
       "      <td>38.791391</td>\n",
       "      <td>18.346358</td>\n",
       "      <td>189.953642</td>\n",
       "      <td>3700.662252</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chinstrap</th>\n",
       "      <td>48.833824</td>\n",
       "      <td>18.420588</td>\n",
       "      <td>195.823529</td>\n",
       "      <td>3733.088235</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gentoo</th>\n",
       "      <td>47.504878</td>\n",
       "      <td>14.982114</td>\n",
       "      <td>217.186992</td>\n",
       "      <td>5076.016260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g\n",
       "species                                                                 \n",
       "Adelie          38.791391      18.346358         189.953642  3700.662252\n",
       "Chinstrap       48.833824      18.420588         195.823529  3733.088235\n",
       "Gentoo          47.504878      14.982114         217.186992  5076.016260"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compute the mean of all species\n",
    "df.groupby(\"species\")[[\"bill_length_mm\", \"bill_depth_mm\", \"flipper_length_mm\", \"body_mass_g\"]].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>34.1</td>\n",
       "      <td>18.1</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3475.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>42.0</td>\n",
       "      <td>20.2</td>\n",
       "      <td>190.0</td>\n",
       "      <td>4250.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>37.8</td>\n",
       "      <td>17.1</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3300.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>37.8</td>\n",
       "      <td>17.3</td>\n",
       "      <td>180.0</td>\n",
       "      <td>3700.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Dream</td>\n",
       "      <td>37.5</td>\n",
       "      <td>18.9</td>\n",
       "      <td>179.0</td>\n",
       "      <td>2975.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>44.5</td>\n",
       "      <td>14.3</td>\n",
       "      <td>216.0</td>\n",
       "      <td>4100.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>46.2</td>\n",
       "      <td>14.4</td>\n",
       "      <td>214.0</td>\n",
       "      <td>4650.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>324</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>47.3</td>\n",
       "      <td>13.8</td>\n",
       "      <td>216.0</td>\n",
       "      <td>4725.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>336</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>44.5</td>\n",
       "      <td>15.7</td>\n",
       "      <td>217.0</td>\n",
       "      <td>4875.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>339</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "3    Adelie  Torgersen             NaN            NaN                NaN   \n",
       "8    Adelie  Torgersen            34.1           18.1              193.0   \n",
       "9    Adelie  Torgersen            42.0           20.2              190.0   \n",
       "10   Adelie  Torgersen            37.8           17.1              186.0   \n",
       "11   Adelie  Torgersen            37.8           17.3              180.0   \n",
       "47   Adelie      Dream            37.5           18.9              179.0   \n",
       "246  Gentoo     Biscoe            44.5           14.3              216.0   \n",
       "286  Gentoo     Biscoe            46.2           14.4              214.0   \n",
       "324  Gentoo     Biscoe            47.3           13.8              216.0   \n",
       "336  Gentoo     Biscoe            44.5           15.7              217.0   \n",
       "339  Gentoo     Biscoe             NaN            NaN                NaN   \n",
       "\n",
       "     body_mass_g  sex  \n",
       "3            NaN  NaN  \n",
       "8         3475.0  NaN  \n",
       "9         4250.0  NaN  \n",
       "10        3300.0  NaN  \n",
       "11        3700.0  NaN  \n",
       "47        2975.0  NaN  \n",
       "246       4100.0  NaN  \n",
       "286       4650.0  NaN  \n",
       "324       4725.0  NaN  \n",
       "336       4875.0  NaN  \n",
       "339          NaN  NaN  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show rows with missing values\n",
    "df_missinig = df[df.isna().any(axis=1)]\n",
    "df_missinig"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to handle missing values\n",
    "\n",
    "There is no one-size-fits-all solution for handling missing value.\n",
    "If the dataset is large and only a few values are missing, it might be practical to simply drop these data points. \n",
    "Alternatively, techniques such as imputation can be used to fill in the gaps. For instance, if temperature readings are missing in a time series, interpolating based on surrounding data points might be effective. \n",
    "\n",
    "Sometimes retaining missing values is beneficial, as their absence itself can provide meaningful insights into the data. For example, if a survey question is left blank, it might indicate that the respondent was uncomfortable answering it.\n",
    "Certain algorithms are designed to interpret and learn from these missing patterns, adding an extra layer of analysis.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_g</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3800.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3250.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.3</td>\n",
       "      <td>20.6</td>\n",
       "      <td>190.0</td>\n",
       "      <td>3650.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>338</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>47.2</td>\n",
       "      <td>13.7</td>\n",
       "      <td>214.0</td>\n",
       "      <td>4925.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>340</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>46.8</td>\n",
       "      <td>14.3</td>\n",
       "      <td>215.0</td>\n",
       "      <td>4850.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>341</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>50.4</td>\n",
       "      <td>15.7</td>\n",
       "      <td>222.0</td>\n",
       "      <td>5750.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>342</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>45.2</td>\n",
       "      <td>14.8</td>\n",
       "      <td>212.0</td>\n",
       "      <td>5200.0</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>343</th>\n",
       "      <td>Gentoo</td>\n",
       "      <td>Biscoe</td>\n",
       "      <td>49.9</td>\n",
       "      <td>16.1</td>\n",
       "      <td>213.0</td>\n",
       "      <td>5400.0</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>342 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0    Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1    Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2    Adelie  Torgersen            40.3           18.0              195.0   \n",
       "4    Adelie  Torgersen            36.7           19.3              193.0   \n",
       "5    Adelie  Torgersen            39.3           20.6              190.0   \n",
       "..      ...        ...             ...            ...                ...   \n",
       "338  Gentoo     Biscoe            47.2           13.7              214.0   \n",
       "340  Gentoo     Biscoe            46.8           14.3              215.0   \n",
       "341  Gentoo     Biscoe            50.4           15.7              222.0   \n",
       "342  Gentoo     Biscoe            45.2           14.8              212.0   \n",
       "343  Gentoo     Biscoe            49.9           16.1              213.0   \n",
       "\n",
       "     body_mass_g     sex  \n",
       "0         3750.0    MALE  \n",
       "1         3800.0  FEMALE  \n",
       "2         3250.0  FEMALE  \n",
       "4         3450.0  FEMALE  \n",
       "5         3650.0    MALE  \n",
       "..           ...     ...  \n",
       "338       4925.0  FEMALE  \n",
       "340       4850.0  FEMALE  \n",
       "341       5750.0    MALE  \n",
       "342       5200.0  FEMALE  \n",
       "343       5400.0    MALE  \n",
       "\n",
       "[342 rows x 7 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop rows with missing values\n",
    "# you can use inplace=True to modify the original data frame\n",
    "\n",
    "# In our dataset, we can see that there are two samples with no numerical measurements\n",
    "# There are 11 samples with only sex missing\n",
    "\n",
    "# we can drop all rows with missing values, becasue that's a small fraction of the data\n",
    "\n",
    "# maybe we can drop rows with missing numerical measurements and keep those with missing sex\n",
    "# This can be achieved by thresh=6, which means that the row must have at least 6 non-missing values\n",
    "\n",
    "df_clean = df.dropna(thresh=6)\n",
    "df_clean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Apply\n",
    "\n",
    "The `apply` method is used to apply a function along the axis of a DataFrame or Series."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the body_mass_g column to kg\n",
    "df['body_mass_g'] = df['body_mass_g'].apply(lambda x: x/1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_kg</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3.75</td>\n",
       "      <td>MALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3.80</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3.25</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3.45</td>\n",
       "      <td>FEMALE</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0  Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1  Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2  Adelie  Torgersen            40.3           18.0              195.0   \n",
       "3  Adelie  Torgersen             NaN            NaN                NaN   \n",
       "4  Adelie  Torgersen            36.7           19.3              193.0   \n",
       "\n",
       "   body_mass_kg     sex  \n",
       "0          3.75    MALE  \n",
       "1          3.80  FEMALE  \n",
       "2          3.25  FEMALE  \n",
       "3           NaN     NaN  \n",
       "4          3.45  FEMALE  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rename the column\n",
    "df.rename(columns={'body_mass_g': 'body_mass_kg'}, inplace=True)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>species</th>\n",
       "      <th>island</th>\n",
       "      <th>bill_length_mm</th>\n",
       "      <th>bill_depth_mm</th>\n",
       "      <th>flipper_length_mm</th>\n",
       "      <th>body_mass_kg</th>\n",
       "      <th>sex</th>\n",
       "      <th>bill_ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.1</td>\n",
       "      <td>18.7</td>\n",
       "      <td>181.0</td>\n",
       "      <td>3.75</td>\n",
       "      <td>MALE</td>\n",
       "      <td>0.216022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>39.5</td>\n",
       "      <td>17.4</td>\n",
       "      <td>186.0</td>\n",
       "      <td>3.80</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>0.212366</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>40.3</td>\n",
       "      <td>18.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>3.25</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>0.206667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Adelie</td>\n",
       "      <td>Torgersen</td>\n",
       "      <td>36.7</td>\n",
       "      <td>19.3</td>\n",
       "      <td>193.0</td>\n",
       "      <td>3.45</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>0.190155</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \\\n",
       "0  Adelie  Torgersen            39.1           18.7              181.0   \n",
       "1  Adelie  Torgersen            39.5           17.4              186.0   \n",
       "2  Adelie  Torgersen            40.3           18.0              195.0   \n",
       "3  Adelie  Torgersen             NaN            NaN                NaN   \n",
       "4  Adelie  Torgersen            36.7           19.3              193.0   \n",
       "\n",
       "   body_mass_kg     sex  bill_ratio  \n",
       "0          3.75    MALE    0.216022  \n",
       "1          3.80  FEMALE    0.212366  \n",
       "2          3.25  FEMALE    0.206667  \n",
       "3           NaN     NaN         NaN  \n",
       "4          3.45  FEMALE    0.190155  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compute the ratio of bill length to flipper length\n",
    "df['bill_ratio'] = df.apply(lambda row: row['bill_length_mm'] / row['flipper_length_mm'], axis=1)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# View or Copy\n",
    "\n",
    "Use iloc or loc to modify the DataFrame.\n",
    "\n",
    "Be careful when slice a dataframe, and modify it. It is better to use the `copy()` method to avoid modifying the original DataFrame.\n",
    "\n",
    "For more detailed disucssion, see [here](https://www.practicaldatascience.org/html/views_and_copies_in_pandas.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the suggested way to modify the dataframe is to use .loc or .iloc\n",
    "tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
    "print(tmp)\n",
    "tmp.loc[0, 'A'] = 10\n",
    "print(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example data frame\n",
    "tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
    "print(tmp)\n",
    "# this is a view of the original dataframe\n",
    "col_A = tmp['A']\n",
    "# this modify the original dataframe\n",
    "col_A[0] = 10\n",
    "print(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similarly, changing tmp also changes col_A\n",
    "tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
    "# this is a view of the original dataframe\n",
    "col_A = tmp['A']\n",
    "# this modify the original dataframe\n",
    "tmp['A'][0] = 10\n",
    "print(col_A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This gives a warning\n",
    "tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
    "tmp['A'][0] = 10\n",
    "print(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})\n",
    "# this is a copy of the column\n",
    "col_A = tmp['A'].copy()\n",
    "# change the copy does not change the original dataframe\n",
    "col_A[0] = 10\n",
    "print(tmp)"
   ]
  }
 ],
 "metadata": {
  "deepnote": {},
  "deepnote_execution_queue": [],
  "deepnote_notebook_id": "10c00f3043944b9ebe3ee417968fcfd0",
  "deepnote_persisted_session": {
   "createdAt": "2023-10-16T21:31:24.983Z"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}