Lecture Week 3 Fri 10/19

import seaborn as sns
df = sns.load_dataset("penguins")

Q: What is the average body mass for Adelie penguins?

df
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

344 rows × 7 columns

# Full answer in one line: filter to Adelie rows, take body_mass_g, average it.
# The cells below rebuild this expression one step at a time.
df[df.species == 'Adelie'].body_mass_g.mean()
3700.662251655629
df.species
0      Adelie
1      Adelie
2      Adelie
3      Adelie
4      Adelie
        ...  
339    Gentoo
340    Gentoo
341    Gentoo
342    Gentoo
343    Gentoo
Name: species, Length: 344, dtype: object
df.species == 'Adelie'
0       True
1       True
2       True
3       True
4       True
       ...  
339    False
340    False
341    False
342    False
343    False
Name: species, Length: 344, dtype: bool
df[df.species == 'Adelie']
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
... ... ... ... ... ... ... ...
147 Adelie Dream 36.6 18.4 184.0 3475.0 Female
148 Adelie Dream 36.0 17.8 195.0 3450.0 Female
149 Adelie Dream 37.8 18.1 193.0 3750.0 Male
150 Adelie Dream 36.0 17.1 187.0 3700.0 Female
151 Adelie Dream 41.5 18.5 201.0 4000.0 Male

152 rows × 7 columns

df[df.species == 'Adelie'].body_mass_g
0      3750.0
1      3800.0
2      3250.0
3         NaN
4      3450.0
        ...  
147    3475.0
148    3450.0
149    3750.0
150    3700.0
151    4000.0
Name: body_mass_g, Length: 152, dtype: float64
# Average of the selected column; the NaN entry (row 3) is ignored by mean().
df[df.species == 'Adelie'].body_mass_g.mean()
3700.662251655629
# Average body mass within each sex; rows whose sex is missing form no group.
df.groupby('sex')['body_mass_g'].mean()
sex
Female    3862.272727
Male      4545.684524
Name: body_mass_g, dtype: float64
df.describe()
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
count 342.000000 342.000000 342.000000 342.000000
mean 43.921930 17.151170 200.915205 4201.754386
std 5.459584 1.974793 14.061714 801.954536
min 32.100000 13.100000 172.000000 2700.000000
25% 39.225000 15.600000 190.000000 3550.000000
50% 44.450000 17.300000 197.000000 4050.000000
75% 48.500000 18.700000 213.000000 4750.000000
max 59.600000 21.500000 231.000000 6300.000000
# loc: select by row and column labels
# note: a label slice includes both endpoints, so 0:2 returns rows 0, 1 and 2
df.loc[0:2, ['species', 'island']]
species island
0 Adelie Torgersen
1 Adelie Torgersen
2 Adelie Torgersen
# iloc: select by integer row and column positions
# follows the Python slicing convention: the end point is not included
df.iloc[0:2, 0:3]
species island bill_length_mm
0 Adelie Torgersen 39.1
1 Adelie Torgersen 39.5
df.iloc[0:2, [1,3,5]]
island bill_depth_mm body_mass_g
0 Torgersen 18.7 3750.0
1 Torgersen 17.4 3800.0

Q: What is the bill length of the heaviest penguin in the dataset?

df.body_mass_g
0      3750.0
1      3800.0
2      3250.0
3         NaN
4      3450.0
        ...  
339       NaN
340    4850.0
341    5750.0
342    5200.0
343    5400.0
Name: body_mass_g, Length: 344, dtype: float64
df.body_mass_g.idxmax()
237
df.loc[df.body_mass_g.idxmax()]
species              Gentoo
island               Biscoe
bill_length_mm         49.2
bill_depth_mm          15.2
flipper_length_mm     221.0
body_mass_g          6300.0
sex                    Male
Name: 237, dtype: object
# idxmax() gives the row label of the largest body mass; loc then reads
# the bill_length_mm value stored in that row.
df.loc[df.body_mass_g.idxmax(),'bill_length_mm']
49.2

Q: How many penguins are on Dream island and have body mass greater than 4000 grams?

df['island'] == 'Dream'
0      False
1      False
2      False
3      False
4      False
       ...  
339    False
340    False
341    False
342    False
343    False
Name: island, Length: 344, dtype: bool
df['body_mass_g'] > 4000
0      False
1      False
2      False
3      False
4      False
       ...  
339    False
340     True
341     True
342     True
343     True
Name: body_mass_g, Length: 344, dtype: bool
(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)
0      False
1      False
2      False
3      False
4      False
       ...  
339    False
340    False
341    False
342    False
343    False
Length: 344, dtype: bool
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
35 Adelie Dream 39.2 21.1 196.0 4150.0 Male
39 Adelie Dream 39.8 19.1 184.0 4650.0 Male
43 Adelie Dream 44.1 19.7 196.0 4400.0 Male
45 Adelie Dream 39.6 18.8 190.0 4600.0 Male
49 Adelie Dream 42.3 21.2 191.0 4150.0 Male
91 Adelie Dream 41.1 18.1 205.0 4300.0 Male
93 Adelie Dream 39.6 18.1 186.0 4450.0 Male
95 Adelie Dream 40.8 18.9 208.0 4300.0 Male
97 Adelie Dream 40.3 18.5 196.0 4350.0 Male
99 Adelie Dream 43.2 18.5 192.0 4100.0 Male
133 Adelie Dream 37.5 18.5 199.0 4475.0 Male
139 Adelie Dream 39.7 17.9 193.0 4250.0 Male
146 Adelie Dream 39.2 18.6 190.0 4250.0 Male
160 Chinstrap Dream 46.0 18.9 195.0 4150.0 Female
165 Chinstrap Dream 52.0 18.1 201.0 4050.0 Male
167 Chinstrap Dream 50.5 19.6 201.0 4050.0 Male
171 Chinstrap Dream 49.2 18.2 195.0 4400.0 Male
177 Chinstrap Dream 52.0 19.0 197.0 4150.0 Male
181 Chinstrap Dream 52.8 20.0 205.0 4550.0 Male
183 Chinstrap Dream 54.2 20.8 201.0 4300.0 Male
185 Chinstrap Dream 51.0 18.8 203.0 4100.0 Male
189 Chinstrap Dream 52.0 20.7 210.0 4800.0 Male
191 Chinstrap Dream 53.5 19.9 205.0 4500.0 Male
197 Chinstrap Dream 50.8 18.5 201.0 4450.0 Male
199 Chinstrap Dream 49.0 19.6 212.0 4300.0 Male
205 Chinstrap Dream 50.7 19.7 203.0 4050.0 Male
209 Chinstrap Dream 49.3 19.9 203.0 4050.0 Male
218 Chinstrap Dream 50.8 19.0 210.0 4100.0 Male
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)].shape
(28, 7)
# shape is (rows, columns), so shape[0] is the number of matching penguins.
df[(df['island'] == 'Dream') & (df['body_mass_g'] > 4000)].shape[0]
28
# Add a new column: element-wise ratio of bill length to bill depth
# (NaN wherever the measurements are missing, e.g. row 3).
df['bill_ratio'] = df.bill_length_mm / df.bill_depth_mm
df.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex bill_ratio
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 2.090909
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 2.270115
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female 2.238889
3 Adelie Torgersen NaN NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female 1.901554
df.isna()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex bill_ratio
0 False False False False False False False False
1 False False False False False False False False
2 False False False False False False False False
3 False False True True True True True True
4 False False False False False False False False
... ... ... ... ... ... ... ... ...
339 False False True True True True True True
340 False False False False False False False False
341 False False False False False False False False
342 False False False False False False False False
343 False False False False False False False False

344 rows × 8 columns

# axis=1 checks across the columns of each row: True if the row has any missing value.
df.isna().any(axis=1)
0      False
1      False
2      False
3       True
4      False
       ...  
339     True
340    False
341    False
342    False
343    False
Length: 344, dtype: bool
# Keep only the rows that contain at least one missing value.
# (Variable renamed from the misspelled `df_missinig`.)
df_missing = df[df.isna().any(axis=1)]
df_missing
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex bill_ratio
3 Adelie Torgersen NaN NaN NaN NaN NaN NaN
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN 1.883978
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN 2.079208
10 Adelie Torgersen 37.8 17.1 186.0 3300.0 NaN 2.210526
11 Adelie Torgersen 37.8 17.3 180.0 3700.0 NaN 2.184971
47 Adelie Dream 37.5 18.9 179.0 2975.0 NaN 1.984127
246 Gentoo Biscoe 44.5 14.3 216.0 4100.0 NaN 3.111888
286 Gentoo Biscoe 46.2 14.4 214.0 4650.0 NaN 3.208333
324 Gentoo Biscoe 47.3 13.8 216.0 4725.0 NaN 3.427536
336 Gentoo Biscoe 44.5 15.7 217.0 4875.0 NaN 2.834395
339 Gentoo Biscoe NaN NaN NaN NaN NaN NaN
# Drop every row that has at least one missing value (344 rows -> 333 rows).
df_clean = df.dropna()
df_clean
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex bill_ratio
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 2.090909
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 2.270115
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female 2.238889
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female 1.901554
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male 1.907767
... ... ... ... ... ... ... ... ...
338 Gentoo Biscoe 47.2 13.7 214.0 4925.0 Female 3.445255
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female 3.272727
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 3.210191
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 3.054054
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 3.099379

333 rows × 8 columns

# Static scatter plot of bill depth against bill length, one color per species.
sns.scatterplot(data=df, x="bill_length_mm", y="bill_depth_mm", hue="species")
<Axes: xlabel='bill_length_mm', ylabel='bill_depth_mm'>
../_images/89b6ebf0c4ec0f0bd6de8ee359833330428769527dc0e45295a04458027e31ab.png
import altair as alt
# Interactive scatter plot of bill depth against bill length, colored by species.
# Explicit scale domains zoom the axes onto the data instead of starting at 0.
alt.Chart(df).mark_circle().encode(
    x=alt.X("bill_length_mm", scale=alt.Scale(domain=(30, 60))),
    # The y channel is configured with alt.Y (the original used alt.X here).
    y=alt.Y("bill_depth_mm", scale=alt.Scale(domain=(12, 22))),
    color="species",
    tooltip=["species", "bill_length_mm", "bill_depth_mm", "island", "sex"],
)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
# Interval selection: drag a rectangle on the scatter plot to select points.
brush = alt.selection_interval()

# Scatter plot that owns the brush selection.
c1 = alt.Chart(df).mark_point().encode(
    x=alt.X("bill_length_mm", scale=alt.Scale(domain=(30, 60))),
    # The y channel is configured with alt.Y (the original used alt.X here).
    y=alt.Y("bill_depth_mm", scale=alt.Scale(domain=(12, 22))),
    color="species:N",
    tooltip=["species", "bill_length_mm", "bill_depth_mm", "island", "sex"],
).add_params(brush)

# Bar chart of counts per species, restricted to the brushed points.
c2 = alt.Chart(df).mark_bar().encode(
    x="species:N",
    y="count()",
).transform_filter(brush)

# Place the two charts side by side.
c1|c2
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
/Users/Ray/opt/anaconda3/envs/math10/lib/python3.12/site-packages/altair/utils/core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
import plotly.express as px
# Interactive Plotly scatter plot of bill depth against bill length,
# colored by species, with extra columns shown on hover.
px.scatter(
    df,
    x="bill_length_mm",
    y="bill_depth_mm",
    color="species",
    hover_data=["species", "bill_length_mm", "bill_depth_mm", "island"],
)
# Distribution of flipper length, colored by species.
sns.displot(df, x="flipper_length_mm", hue="species")
<seaborn.axisgrid.FacetGrid at 0x18fcbfaa0>
../_images/599b884770168e6c3b1c954922721e14d5e8717e1f0d58ab0b33e1f92f27ebd6.png
# Violin plots of flipper length, grouped by species and colored by sex.
sns.violinplot(data=df, x="species", y="flipper_length_mm",hue="sex")
<Axes: xlabel='species', ylabel='flipper_length_mm'>
../_images/212e783d08336b00f0f298487f55865fbd845afc4fb43d42b8122fde47b3497b.png
# Grid of pairwise plots between columns, histograms on the diagonal, colored by species.
sns.pairplot(df, diag_kind="hist", hue = 'species')
<seaborn.axisgrid.PairGrid at 0x18fe09a00>
../_images/7a655c01123ef23c42e07f7f5b6600bd69699eeb92dc8f21fd3808b3cea5330b.png