Week 3, Wed, 4/16

Week 3, Wed, 4/16

import seaborn as sns
# Load seaborn's "penguins" example dataset into `df` and display it.
df = sns.load_dataset("penguins")
df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

# Add body mass expressed in kilograms (grams / 1000).
# Dividing the Series directly is vectorized and leaves NaN entries as NaN,
# so an element-wise .apply(lambda ...) is unnecessary.
df['body_mass_g2'] = df['body_mass_g'] / 1000
df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex body_mass_g2
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 3.75
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 3.80
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female 3.25
3 Adelie Torgersen NaN NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female 3.45
... ... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female 4.85
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 5.75
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 5.20
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 5.40

344 rows × 8 columns

# Give the kilogram column an accurate name and relabel 'sex' as 'gender'.
# rename() returns a new frame; rebinding `df` avoids inplace=True, which
# would silently mutate the object that earlier cells already displayed.
df = df.rename(columns={'body_mass_g2': 'body_mass_kg', 'sex': 'gender'})
df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g gender body_mass_kg
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 3.75
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 3.80
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female 3.25
3 Adelie Torgersen NaN NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female 3.45
... ... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female 4.85
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 5.75
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 5.20
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 5.40

344 rows × 8 columns

# Index label of the row holding the largest body_mass_g value.
idx = df.body_mass_g.idxmax()
idx
237
# Bill length of the heaviest penguin (single-cell lookup by row label and column).
df.at[idx, 'bill_length_mm']
49.2
# Count penguins on Dream island that weigh more than 4000 g:
# .shape gives (matching rows, columns).
on_dream = df['island'] == 'Dream'
over_4kg = df['body_mass_g'] > 4000
df.loc[on_dream & over_4kg].shape
(28, 8)
import seaborn as sns
# Reload the dataset so `df` is the original 7-column frame again,
# discarding the body_mass_kg column and the sex -> gender rename made earlier.
df = sns.load_dataset("penguins")
df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

# Keep only complete rows: drop any row containing at least one missing value.
# `df` itself is not modified; the result is bound to a new name.
df_clean = df.dropna(axis=0, how='any')
df_clean
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male
... ... ... ... ... ... ... ...
338 Gentoo Biscoe 47.2 13.7 214.0 4925.0 Female
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

# Show the rows that dropna() removes: those with a missing value in any column.
has_missing = df.isna().any(axis=1)
df.loc[has_missing]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
3 Adelie Torgersen NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN
339 Gentoo Biscoe NaN NaN NaN NaN NaN
# Mean bill length and bill depth for each species.
bill_cols = ['bill_length_mm', 'bill_depth_mm']
df.groupby('species')[bill_cols].mean()
bill_length_mm bill_depth_mm
species
Adelie 38.791391 18.346358
Chinstrap 48.833824 18.420588
Gentoo 47.504878 14.982114
# Correlation between bill length and bill depth with all species pooled together.
df['bill_length_mm'].corr(df['bill_depth_mm'])
-0.23505287035553263
# Bill length vs. bill depth for every penguin, species not distinguished.
sns.scatterplot(data=df, x="bill_length_mm", y="bill_depth_mm")
<Axes: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
../_images/6a8020e8b3e636d4ac91797fcf27249223baf99b370a6908f1bcc0d02016449b.png
# Distinct species labels, in order of first appearance in the frame.
species = df['species'].unique()
species
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
# Repeat the bill length / bill depth correlation separately within each species.
for s in species:
    in_species = df['species'] == s
    corr = df.loc[in_species, 'bill_length_mm'].corr(df.loc[in_species, 'bill_depth_mm'])
    print(f"{s}:{corr:.2f}")
Adelie:0.39
Chinstrap:0.65
Gentoo:0.64
# Same scatter as before, now colored by species so the per-species trends are visible.
sns.scatterplot(data=df, x="bill_length_mm", y="bill_depth_mm", hue="species")
<Axes: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
../_images/e80393919a0763225ba40c8e981fb0ed0964e5d8ec9aaa6b6e9ff459c81845ac.png
import altair as alt
# Altair scatter of bill length vs. bill depth, colored by species, with
# fixed axis ranges and a hover tooltip listing the selected fields.
# The y channel is built with alt.Y (the original passed an alt.X object to `y`).
alt.Chart(df).mark_circle().encode(
    x = alt.X("bill_length_mm",scale=alt.Scale(domain=(30, 60))),
    y = alt.Y("bill_depth_mm",scale=alt.Scale(domain=(12, 22))),
    color = "species",
    tooltip = ["species", "bill_length_mm", "bill_depth_mm", "island", "sex"]
)
# Interval selection shared by the two charts below.
brush = alt.selection_interval()

# Scatter of bill length vs. bill depth; the brush is attached here so a
# region of points can be selected.
# The y channel is built with alt.Y (the original passed an alt.X object to `y`).
c1 = alt.Chart(df).mark_point().encode(
    x = alt.X("bill_length_mm",scale=alt.Scale(domain=(30, 60))),
    y = alt.Y("bill_depth_mm",scale=alt.Scale(domain=(12, 22))),
    color = "species:N",
    tooltip = ["species", "bill_length_mm", "bill_depth_mm", "island", "sex"]
).add_params(brush)

# Bar chart of row counts per species, filtered by the brush selection.
c2 = alt.Chart(df).mark_bar().encode(
    x="species:N",
    y="count()"
).transform_filter(brush)

# Place the two charts side by side.
c1|c2
# Distribution of flipper length, one color per species.
sns.displot(data=df, x="flipper_length_mm", hue="species")
<seaborn.axisgrid.FacetGrid at 0x3214c1250>
../_images/a94fcb17f29adfbeed7dd47f83453ff2d5a894e435d3a0eb5f1c28917ae46121.png
# Flipper length distribution per species, split by sex through the hue channel.
sns.violinplot(
    data=df,
    x="species",
    y="flipper_length_mm",
    hue="sex",
)
<Axes: xlabel='species', ylabel='flipper_length_mm'>
../_images/e1646fbc10438161e79323ba96fefc36109bbda28cedca3b506e24b9b08107d7.png
# Pairwise plots of the columns, colored by species, with histograms on the diagonal.
sns.pairplot(data=df, hue="species", diag_kind="hist")
<seaborn.axisgrid.PairGrid at 0x323548f50>
../_images/40cf8f3db269debb84df7f8df51cbbc4cd9d1022776fe0ad1f75639fe051f1cd.png