import pandas
import seaborn as sns
tips = sns.load_dataset("tips")
tips
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
tips.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null category 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 7.4 KB
tips["sex"].unique()
['Female', 'Male'] Categories (2, object): ['Male', 'Female']
tips["day"].unique()
['Sun', 'Sat', 'Thur', 'Fri'] Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']
tips["time"].unique()
['Dinner', 'Lunch'] Categories (2, object): ['Lunch', 'Dinner']
tips["tip"].mean()
2.9982786885245902
tips["tip"].max()
10.0
tips[tips["smoker"]== "Yes"]
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 56 | 38.01 | 3.00 | Male | Yes | Sat | Dinner | 4 |
| 58 | 11.24 | 1.76 | Male | Yes | Sat | Dinner | 2 |
| 60 | 20.29 | 3.21 | Male | Yes | Sat | Dinner | 2 |
| 61 | 13.81 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 62 | 11.02 | 1.98 | Male | Yes | Sat | Dinner | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 234 | 15.53 | 3.00 | Male | Yes | Sat | Dinner | 2 |
| 236 | 12.60 | 1.00 | Male | Yes | Sat | Dinner | 2 |
| 237 | 32.83 | 1.17 | Male | Yes | Sat | Dinner | 2 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
93 rows × 7 columns
tips.sort_values(by="total_bill", ascending=False)
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
| 212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 |
| 59 | 48.27 | 6.73 | Male | No | Sat | Dinner | 4 |
| 156 | 48.17 | 5.00 | Male | No | Sun | Dinner | 6 |
| 182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
| 111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
| 172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
| 92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
| 67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
244 rows × 7 columns
grouped = tips.groupby("sex")
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff30a2a3ee0>
grouped.get_group("Male")
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 5 | 25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
| 6 | 8.77 | 2.00 | Male | No | Sun | Dinner | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 236 | 12.60 | 1.00 | Male | Yes | Sat | Dinner | 2 |
| 237 | 32.83 | 1.17 | Male | Yes | Sat | Dinner | 2 |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
157 rows × 7 columns
for k, df in grouped:
print(k)
display(df.head(3))
print("\n")
Male
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
Female
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| 11 | 35.26 | 5.00 | Female | No | Sun | Dinner | 4 |
grouped.count()
| total_bill | tip | smoker | day | time | size | |
|---|---|---|---|---|---|---|
| sex | ||||||
| Male | 157 | 157 | 157 | 157 | 157 | 157 |
| Female | 87 | 87 | 87 | 87 | 87 | 87 |
grouped.sum()
| total_bill | tip | size | |
|---|---|---|---|
| sex | |||
| Male | 3256.82 | 485.07 | 413 |
| Female | 1570.95 | 246.51 | 214 |
grouped["total_bill"].max()
sex Male 50.81 Female 44.30 Name: total_bill, dtype: float64
grouped[["total_bill", "tip"]].mean()
| total_bill | tip | |
|---|---|---|
| sex | ||
| Male | 20.744076 | 3.089618 |
| Female | 18.056897 | 2.833448 |
def large_amounts(values, min_value=5):
n = 0
for t in values:
if t >= min_value:
n += 1
return n
grouped[["tip", "total_bill"]].agg(large_amounts)
| tip | total_bill | |
|---|---|---|
| sex | ||
| Male | 20 | 157 |
| Female | 8 | 86 |
grouped[["tip", "total_bill"]].agg({"tip": lambda x: large_amounts(x, min_value=3),
"total_bill": lambda x: large_amounts(x, min_value=15)
})
| tip | total_bill | |
|---|---|---|
| sex | ||
| Male | 82 | 113 |
| Female | 39 | 51 |
def large_tips(df):
df = df.sort_values(by="tip", ascending=False)
return df.head(3)
big_tips = grouped.apply(large_tips)
big_tips
| total_bill | tip | sex | smoker | day | time | size | ||
|---|---|---|---|---|---|---|---|---|
| sex | ||||||||
| Male | 170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
| 212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 | |
| 23 | 39.42 | 7.58 | Male | No | Sat | Dinner | 4 | |
| Female | 214 | 28.17 | 6.50 | Female | Yes | Sat | Dinner | 3 |
| 52 | 34.81 | 5.20 | Female | No | Sun | Dinner | 4 | |
| 85 | 34.83 | 5.17 | Female | No | Thur | Lunch | 4 |
tips
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
means = tips.groupby(by=["day", "time"]).mean()
means
| total_bill | tip | size | ||
|---|---|---|---|---|
| day | time | |||
| Thur | Lunch | 17.664754 | 2.767705 | 2.459016 |
| Dinner | 18.780000 | 3.000000 | 2.000000 | |
| Fri | Lunch | 12.845714 | 2.382857 | 2.000000 |
| Dinner | 19.663333 | 2.940000 | 2.166667 | |
| Sat | Lunch | NaN | NaN | NaN |
| Dinner | 20.441379 | 2.993103 | 2.517241 | |
| Sun | Lunch | NaN | NaN | NaN |
| Dinner | 21.410000 | 3.255132 | 2.842105 |
tips[(tips["day"]=="Sat") & (tips["time"] == "Lunch") ]
| total_bill | tip | sex | smoker | day | time | size |
|---|
means.index
MultiIndex([('Thur', 'Lunch'),
('Thur', 'Dinner'),
( 'Fri', 'Lunch'),
( 'Fri', 'Dinner'),
( 'Sat', 'Lunch'),
( 'Sat', 'Dinner'),
( 'Sun', 'Lunch'),
( 'Sun', 'Dinner')],
names=['day', 'time'])
means.loc[('Thur', 'Lunch')]
total_bill 17.664754 tip 2.767705 size 2.459016 Name: (Thur, Lunch), dtype: float64
flights = sns.load_dataset("flights")
flights
| year | month | passengers | |
|---|---|---|---|
| 0 | 1949 | Jan | 112 |
| 1 | 1949 | Feb | 118 |
| 2 | 1949 | Mar | 132 |
| 3 | 1949 | Apr | 129 |
| 4 | 1949 | May | 121 |
| ... | ... | ... | ... |
| 139 | 1960 | Aug | 606 |
| 140 | 1960 | Sep | 508 |
| 141 | 1960 | Oct | 461 |
| 142 | 1960 | Nov | 390 |
| 143 | 1960 | Dec | 432 |
144 rows × 3 columns
flights = flights.set_index("year")
flights
| month | passengers | |
|---|---|---|
| year | ||
| 1949 | Jan | 112 |
| 1949 | Feb | 118 |
| 1949 | Mar | 132 |
| 1949 | Apr | 129 |
| 1949 | May | 121 |
| ... | ... | ... |
| 1960 | Aug | 606 |
| 1960 | Sep | 508 |
| 1960 | Oct | 461 |
| 1960 | Nov | 390 |
| 1960 | Dec | 432 |
144 rows × 2 columns
flights.loc["1950"]
| month | passengers | |
|---|---|---|
| year | ||
| 1950 | Jan | 115 |
| 1950 | Feb | 126 |
| 1950 | Mar | 141 |
| 1950 | Apr | 135 |
| 1950 | May | 125 |
| 1950 | Jun | 149 |
| 1950 | Jul | 170 |
| 1950 | Aug | 170 |
| 1950 | Sep | 158 |
| 1950 | Oct | 133 |
| 1950 | Nov | 114 |
| 1950 | Dec | 140 |
flights = flights.reset_index()
flights
| year | month | passengers | |
|---|---|---|---|
| 0 | 1949 | Jan | 112 |
| 1 | 1949 | Feb | 118 |
| 2 | 1949 | Mar | 132 |
| 3 | 1949 | Apr | 129 |
| 4 | 1949 | May | 121 |
| ... | ... | ... | ... |
| 139 | 1960 | Aug | 606 |
| 140 | 1960 | Sep | 508 |
| 141 | 1960 | Oct | 461 |
| 142 | 1960 | Nov | 390 |
| 143 | 1960 | Dec | 432 |
144 rows × 3 columns
flights = flights.set_index(["year", "month"])
flights
| passengers | ||
|---|---|---|
| year | month | |
| 1949 | Jan | 112 |
| Feb | 118 | |
| Mar | 132 | |
| Apr | 129 | |
| May | 121 | |
| ... | ... | ... |
| 1960 | Aug | 606 |
| Sep | 508 | |
| Oct | 461 | |
| Nov | 390 | |
| Dec | 432 |
144 rows × 1 columns
flights.index
MultiIndex([(1949, 'Jan'),
(1949, 'Feb'),
(1949, 'Mar'),
(1949, 'Apr'),
(1949, 'May'),
(1949, 'Jun'),
(1949, 'Jul'),
(1949, 'Aug'),
(1949, 'Sep'),
(1949, 'Oct'),
...
(1960, 'Mar'),
(1960, 'Apr'),
(1960, 'May'),
(1960, 'Jun'),
(1960, 'Jul'),
(1960, 'Aug'),
(1960, 'Sep'),
(1960, 'Oct'),
(1960, 'Nov'),
(1960, 'Dec')],
names=['year', 'month'], length=144)
flights.loc[(1960, 'Mar')]
passengers 419 Name: (1960, Mar), dtype: int64
flights = flights.unstack()
flights
| passengers | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
| year | ||||||||||||
| 1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
| 1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
| 1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
| 1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
| 1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
| 1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
| 1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
| 1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
| 1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
| 1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
| 1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
| 1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights["Jan"]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3360 try: -> 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: ~/opt/anaconda3/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() ~/opt/anaconda3/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'Jan' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) /var/folders/vd/9gpvwb493r52y4sgtl_fvtvm0000gn/T/ipykernel_12352/3691583124.py in <module> ----> 1 flights["Jan"] ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key) 3455 if is_single_key: 3456 if self.columns.nlevels > 1: -> 3457 return self._getitem_multilevel(key) 3458 indexer = self.columns.get_loc(key) 3459 if is_integer(indexer): ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key) 3506 def _getitem_multilevel(self, key): 3507 # self.columns is a MultiIndex -> 3508 loc = self.columns.get_loc(key) 3509 if isinstance(loc, (slice, np.ndarray)): 3510 new_columns = self.columns[loc] ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method) 2920 2921 if not isinstance(key, tuple): -> 2922 loc = self._get_level_indexer(key, level=0) 2923 return _maybe_to_slice(loc) 2924 ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 3202 else: 3203 -> 3204 idx = self._get_loc_single_level_index(level_index, key) 3205 3206 if level > 0 or self._lexsort_depth == 0: ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_loc_single_level_index(self, level_index, key) 2853 return -1 2854 else: -> 2855 return level_index.get_loc(key) 2856 2857 def get_loc(self, key, method=None): ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: -> 3363 raise KeyError(key) from err 3364 3365 if is_scalar(key) and isna(key) and not self.hasnans: KeyError: 'Jan'
flights.columns
MultiIndex([('passengers', 'Jan'),
('passengers', 'Feb'),
('passengers', 'Mar'),
('passengers', 'Apr'),
('passengers', 'May'),
('passengers', 'Jun'),
('passengers', 'Jul'),
('passengers', 'Aug'),
('passengers', 'Sep'),
('passengers', 'Oct'),
('passengers', 'Nov'),
('passengers', 'Dec')],
names=[None, 'month'])
flights[('passengers', 'Jan')]
year 1949 112 1950 115 1951 145 1952 171 1953 196 1954 204 1955 242 1956 284 1957 315 1958 340 1959 360 1960 417 Name: (passengers, Jan), dtype: int64
flights = flights.droplevel(0, axis=1)
flights
| month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | ||||||||||||
| 1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
| 1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
| 1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
| 1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
| 1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
| 1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
| 1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
| 1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
| 1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
| 1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
| 1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
| 1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights.columns
CategoricalIndex(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
'Sep', 'Oct', 'Nov', 'Dec'],
categories=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', ...], ordered=False, dtype='category', name='month')
flights["Jan"]
year 1949 112 1950 115 1951 145 1952 171 1953 196 1954 204 1955 242 1956 284 1957 315 1958 340 1959 360 1960 417 Name: Jan, dtype: int64
flights
| month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | ||||||||||||
| 1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
| 1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
| 1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
| 1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
| 1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
| 1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
| 1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
| 1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
| 1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
| 1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
| 1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
| 1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights = flights.stack()
flights
year month
1949 Jan 112
Feb 118
Mar 132
Apr 129
May 121
...
1960 Aug 606
Sep 508
Oct 461
Nov 390
Dec 432
Length: 144, dtype: int64