Lecture Week 3 Fri 10/19
import seaborn as sns
df = sns.load_dataset("penguins")
Q: What is the average body mass for Adelie penguins?
df
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
| 340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
| 341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
| 342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
| 343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
df[df.species == 'Adelie'].body_mass_g.mean()
3700.662251655629
df.species
0 Adelie
1 Adelie
2 Adelie
3 Adelie
4 Adelie
...
339 Gentoo
340 Gentoo
341 Gentoo
342 Gentoo
343 Gentoo
Name: species, Length: 344, dtype: object
df.species == 'Adelie'
0 True
1 True
2 True
3 True
4 True
...
339 False
340 False
341 False
342 False
343 False
Name: species, Length: 344, dtype: bool
df[df.species == 'Adelie']
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 147 | Adelie | Dream | 36.6 | 18.4 | 184.0 | 3475.0 | Female |
| 148 | Adelie | Dream | 36.0 | 17.8 | 195.0 | 3450.0 | Female |
| 149 | Adelie | Dream | 37.8 | 18.1 | 193.0 | 3750.0 | Male |
| 150 | Adelie | Dream | 36.0 | 17.1 | 187.0 | 3700.0 | Female |
| 151 | Adelie | Dream | 41.5 | 18.5 | 201.0 | 4000.0 | Male |
152 rows × 7 columns
df[df.species == 'Adelie'].body_mass_g
0 3750.0
1 3800.0
2 3250.0
3 NaN
4 3450.0
...
147 3475.0
148 3450.0
149 3750.0
150 3700.0
151 4000.0
Name: body_mass_g, Length: 152, dtype: float64
df[df.species == 'Adelie'].body_mass_g.mean()
3700.662251655629
# average body mass within each sex; rows whose sex is missing (NaN)
# do not form a group, so only Female and Male appear in the result
df.groupby('sex')['body_mass_g'].mean()
sex
Female 3862.272727
Male 4545.684524
Name: body_mass_g, dtype: float64
df.describe()
| bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | |
|---|---|---|---|---|
| count | 342.000000 | 342.000000 | 342.000000 | 342.000000 |
| mean | 43.921930 | 17.151170 | 200.915205 | 4201.754386 |
| std | 5.459584 | 1.974793 | 14.061714 | 801.954536 |
| min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 |
| 25% | 39.225000 | 15.600000 | 190.000000 | 3550.000000 |
| 50% | 44.450000 | 17.300000 | 197.000000 | 4050.000000 |
| 75% | 48.500000 | 18.700000 | 213.000000 | 4750.000000 |
| max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 |
# loc: select by row and column labels
# unlike ordinary Python slicing, a label slice includes BOTH end points,
# so 0:2 returns the rows labelled 0, 1 and 2
df.loc[0:2, ['species', 'island']]
| species | island | |
|---|---|---|
| 0 | Adelie | Torgersen |
| 1 | Adelie | Torgersen |
| 2 | Adelie | Torgersen |
# iloc: select by integer row and column positions
# follows the Python slicing convention: the end point is not included,
# so 0:2 returns positions 0 and 1 only
df.iloc[0:2, 0:3]
| species | island | bill_length_mm | |
|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 |
| 1 | Adelie | Torgersen | 39.5 |
df.iloc[0:2, [1,3,5]]
| island | bill_depth_mm | body_mass_g | |
|---|---|---|---|
| 0 | Torgersen | 18.7 | 3750.0 |
| 1 | Torgersen | 17.4 | 3800.0 |
Q: What is the bill length of the heaviest penguin in the dataset?
df.body_mass_g
0 3750.0
1 3800.0
2 3250.0
3 NaN
4 3450.0
...
339 NaN
340 4850.0
341 5750.0
342 5200.0
343 5400.0
Name: body_mass_g, Length: 344, dtype: float64
# idxmax returns the index label (not the value) of the largest entry;
# missing values are skipped
df.body_mass_g.idxmax()
237
df.loc[df.body_mass_g.idxmax()]
species Gentoo
island Biscoe
bill_length_mm 49.2
bill_depth_mm 15.2
flipper_length_mm 221.0
body_mass_g 6300.0
sex Male
Name: 237, dtype: object
df.loc[df.body_mass_g.idxmax(),'bill_length_mm']
49.2
Q: How many penguins are on Dream island and have body mass greater than 4000 grams?
df['island'] == 'Dream'
0 False
1 False
2 False
3 False
4 False
...
339 False
340 False
341 False
342 False
343 False
Name: island, Length: 344, dtype: bool
# element-wise comparison; any comparison with NaN is False, so penguins
# with a missing body mass (e.g. row 339) are marked False
df['body_mass_g'] > 4000
0 False
1 False
2 False
3 False
4 False
...
339 False
340 True
341 True
342 True
343 True
Name: body_mass_g, Length: 344, dtype: bool
# combine the two boolean masks element-wise with & (logical AND);
# the parentheses are required because & binds more tightly than == and >
(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)
0 False
1 False
2 False
3 False
4 False
...
339 False
340 False
341 False
342 False
343 False
Length: 344, dtype: bool
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)]
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 35 | Adelie | Dream | 39.2 | 21.1 | 196.0 | 4150.0 | Male |
| 39 | Adelie | Dream | 39.8 | 19.1 | 184.0 | 4650.0 | Male |
| 43 | Adelie | Dream | 44.1 | 19.7 | 196.0 | 4400.0 | Male |
| 45 | Adelie | Dream | 39.6 | 18.8 | 190.0 | 4600.0 | Male |
| 49 | Adelie | Dream | 42.3 | 21.2 | 191.0 | 4150.0 | Male |
| 91 | Adelie | Dream | 41.1 | 18.1 | 205.0 | 4300.0 | Male |
| 93 | Adelie | Dream | 39.6 | 18.1 | 186.0 | 4450.0 | Male |
| 95 | Adelie | Dream | 40.8 | 18.9 | 208.0 | 4300.0 | Male |
| 97 | Adelie | Dream | 40.3 | 18.5 | 196.0 | 4350.0 | Male |
| 99 | Adelie | Dream | 43.2 | 18.5 | 192.0 | 4100.0 | Male |
| 133 | Adelie | Dream | 37.5 | 18.5 | 199.0 | 4475.0 | Male |
| 139 | Adelie | Dream | 39.7 | 17.9 | 193.0 | 4250.0 | Male |
| 146 | Adelie | Dream | 39.2 | 18.6 | 190.0 | 4250.0 | Male |
| 160 | Chinstrap | Dream | 46.0 | 18.9 | 195.0 | 4150.0 | Female |
| 165 | Chinstrap | Dream | 52.0 | 18.1 | 201.0 | 4050.0 | Male |
| 167 | Chinstrap | Dream | 50.5 | 19.6 | 201.0 | 4050.0 | Male |
| 171 | Chinstrap | Dream | 49.2 | 18.2 | 195.0 | 4400.0 | Male |
| 177 | Chinstrap | Dream | 52.0 | 19.0 | 197.0 | 4150.0 | Male |
| 181 | Chinstrap | Dream | 52.8 | 20.0 | 205.0 | 4550.0 | Male |
| 183 | Chinstrap | Dream | 54.2 | 20.8 | 201.0 | 4300.0 | Male |
| 185 | Chinstrap | Dream | 51.0 | 18.8 | 203.0 | 4100.0 | Male |
| 189 | Chinstrap | Dream | 52.0 | 20.7 | 210.0 | 4800.0 | Male |
| 191 | Chinstrap | Dream | 53.5 | 19.9 | 205.0 | 4500.0 | Male |
| 197 | Chinstrap | Dream | 50.8 | 18.5 | 201.0 | 4450.0 | Male |
| 199 | Chinstrap | Dream | 49.0 | 19.6 | 212.0 | 4300.0 | Male |
| 205 | Chinstrap | Dream | 50.7 | 19.7 | 203.0 | 4050.0 | Male |
| 209 | Chinstrap | Dream | 49.3 | 19.9 | 203.0 | 4050.0 | Male |
| 218 | Chinstrap | Dream | 50.8 | 19.0 | 210.0 | 4100.0 | Male |
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)].shape
(28, 7)
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)].shape[0]
28
df['bill_ratio'] = df.bill_length_mm / df.bill_depth_mm
df.head()
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | bill_ratio | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male | 2.090909 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female | 2.270115 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female | 2.238889 |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female | 1.901554 |
df.isna()
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | bill_ratio | |
|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False |
| 3 | False | False | True | True | True | True | True | True |
| 4 | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | False | False | True | True | True | True | True | True |
| 340 | False | False | False | False | False | False | False | False |
| 341 | False | False | False | False | False | False | False | False |
| 342 | False | False | False | False | False | False | False | False |
| 343 | False | False | False | False | False | False | False | False |
344 rows × 8 columns
# axis=1 reduces across the columns of each row:
# True when the row has at least one missing value
df.isna().any(axis=1)
0 False
1 False
2 False
3 True
4 False
...
339 True
340 False
341 False
342 False
343 False
Length: 344, dtype: bool
# keep only the rows that contain at least one missing value
df_missing = df[df.isna().any(axis=1)]
df_missing
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | bill_ratio | |
|---|---|---|---|---|---|---|---|---|
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | Adelie | Torgersen | 34.1 | 18.1 | 193.0 | 3475.0 | NaN | 1.883978 |
| 9 | Adelie | Torgersen | 42.0 | 20.2 | 190.0 | 4250.0 | NaN | 2.079208 |
| 10 | Adelie | Torgersen | 37.8 | 17.1 | 186.0 | 3300.0 | NaN | 2.210526 |
| 11 | Adelie | Torgersen | 37.8 | 17.3 | 180.0 | 3700.0 | NaN | 2.184971 |
| 47 | Adelie | Dream | 37.5 | 18.9 | 179.0 | 2975.0 | NaN | 1.984127 |
| 246 | Gentoo | Biscoe | 44.5 | 14.3 | 216.0 | 4100.0 | NaN | 3.111888 |
| 286 | Gentoo | Biscoe | 46.2 | 14.4 | 214.0 | 4650.0 | NaN | 3.208333 |
| 324 | Gentoo | Biscoe | 47.3 | 13.8 | 216.0 | 4725.0 | NaN | 3.427536 |
| 336 | Gentoo | Biscoe | 44.5 | 15.7 | 217.0 | 4875.0 | NaN | 2.834395 |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN | NaN |
# drop every row that has at least one missing value (344 rows -> 333 rows);
# dropna returns a new DataFrame, df itself is unchanged
df_clean = df.dropna()
df_clean
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | bill_ratio | |
|---|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male | 2.090909 |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female | 2.270115 |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female | 2.238889 |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female | 1.901554 |
| 5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male | 1.907767 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 338 | Gentoo | Biscoe | 47.2 | 13.7 | 214.0 | 4925.0 | Female | 3.445255 |
| 340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female | 3.272727 |
| 341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male | 3.210191 |
| 342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female | 3.054054 |
| 343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male | 3.099379 |
333 rows × 8 columns
sns.scatterplot(
data = df,
x = "bill_length_mm",
y = "bill_depth_mm",
hue = "species"
)
<Axes: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
import altair as alt
# Interactive scatter plot of bill depth against bill length, colored by species.
# The y channel is built with alt.Y (alt.X is the x-channel class); the explicit
# scale domains zoom the axes in on the range of the data instead of starting at 0.
alt.Chart(df).mark_circle().encode(
    x = alt.X("bill_length_mm",scale=alt.Scale(domain=(30, 60))),
    y = alt.Y("bill_depth_mm",scale=alt.Scale(domain=(12, 22))),
    color = "species",
    tooltip = ["species", "bill_length_mm", "bill_depth_mm", "island", "sex"]
)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
# Linked views: dragging an interval selection (brush) over the scatter plot
# filters the bar chart so that it counts only the selected penguins per species.
brush = alt.selection_interval()

# scatter plot that owns the brush; the y channel uses alt.Y (alt.X is for x only)
c1 = alt.Chart(df).mark_point().encode(
    x = alt.X("bill_length_mm",scale=alt.Scale(domain=(30, 60))),
    y = alt.Y("bill_depth_mm",scale=alt.Scale(domain=(12, 22))),
    color = "species:N",
    tooltip = ["species", "bill_length_mm", "bill_depth_mm", "island", "sex"]
).add_params(brush)

# bar chart of counts per species, restricted to the brushed points
c2 = alt.Chart(df).mark_bar().encode(
    x="species:N",
    y="count()"
).transform_filter(brush)

# | places the two charts side by side
c1|c2
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
col = df[col_name].apply(to_list_if_array, convert_dtype=False)
import plotly.express as px
px.scatter(
data_frame=df,
x = "bill_length_mm",
y = "bill_depth_mm",
color = "species",
hover_data = ["species", "bill_length_mm", "bill_depth_mm", "island"]
)
sns.displot(df, x="flipper_length_mm", hue="species")
<seaborn.axisgrid.FacetGrid at 0x18fcbfaa0>
sns.violinplot(data=df, x="species", y="flipper_length_mm",hue="sex")
<Axes: xlabel='species', ylabel='flipper_length_mm'>
sns.pairplot(df, diag_kind="hist", hue = 'species')
<seaborn.axisgrid.PairGrid at 0x18fe09a00>