Pandas实用功能详解:数据处理与统计分析
一、数值计算与统计分析
1. 基本统计方法
import pandas as pd
import numpy as np
# 创建示例DataFrame
data = pd.DataFrame({
'col_a': np.arange(10),
'col_b': np.random.rand(10) * 10
})
print(data)
print('---')
# 非空值计数
print(data.count(), '→ count统计非NaN值的数量\n')
# 最小值与最大值
print(data.min(), '→ min统计最小值')
print(data['col_b'].max(), '→ max统计最大值\n')
# 分位数计算
print(data.quantile(q=0.75), '→ quantile统计分位数\n')
# 求和及均值
print(data.sum(), '→ sum求和')
print(data.mean(), '→ mean求平均值\n')
# 中位数
print(data.median(), '→ median求算数中位数(50%分位数)\n')
# 标准差与方差
print(data.std(), '→ 标准差')
print(data.var(), '→ 方差\n')
# 偏度与峰度
print(data.skew(), '→ skew样本的偏度')
print(data.kurt(), '→ kurt样本的峰度\n')
# 累计和
data['cumsum_a'] = data['col_a'].cumsum()
data['cumsum_b'] = data['col_b'].cumsum()
print(data, '→ cumsum样本的累计和\n')
# 累计积
data['cumprod_a'] = data['col_a'].cumprod()
data['cumprod_b'] = data['col_b'].cumprod()
print(data, '→ cumprod样本的累计积\n')
# 累计最大值和累计最小值
print(data.cummax(), '→ 累计最大值')
print(data.cummin(), '→ 累计最小值')
2. 唯一值处理
# 使用unique()获取唯一值
series = pd.Series(list('asdvasdcfgg'))
unique_vals = series.unique()
print(series)
print(unique_vals, type(unique_vals))
# 转换为新Series
print(pd.Series(unique_vals))
# 排序
unique_vals.sort()
print(unique_vals)
3. 值频率统计
# value_counts()统计各值出现频率
series = pd.Series(list('asdvasdcfgg'))
freq = series.value_counts(sort=False)
print(freq)
4. 成员资格检查
# isin()检查元素是否在指定集合中
series = pd.Series(np.arange(10, 15))
df = pd.DataFrame({
'col1': list('asdcbvasd'),
'col2': np.arange(4, 13)
})
print(series)
print(df)
print('=' * 60)
# 返回布尔Series或DataFrame
print(series.isin([5, 14]))
print(df.isin(['a', 'bc', '10', 8]))
二、文本数据处理
Pandas通过.str访问器提供便捷的字符串操作方法,自动处理缺失值。
1. 基础字符串访问
# 通过str访问字符串方法,自动跳过NaN
series = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
'col1': list('abcdef'),
'col2': ['hee', 'fv', 'w', 'hija', '123', np.nan]
})
print(series)
print(df)
print('=' * 60)
# 调用字符串方法
print(series.str.count('b'))
print(df['col2'].str.upper())
print('=' * 60)
# columns也支持.str操作
df.columns = df.columns.str.upper()
print(df)
2. 常用字符串操作
# 大小写转换、长度、前后缀判断
series = pd.Series(['A', 'b', 'bbhello', '123', np.nan])
print(series.str.lower(), '→ lower小写')
print(series.str.upper(), '→ upper大写')
print(series.str.len(), '→ len字符长度')
print(series.str.startswith('b'), '→ 判断起始')
print(series.str.endswith('3'), '→ 判断结束')
# 去除空格
series = pd.Series([' jack', 'jill ', ' jesse ', 'frank'])
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '])
print(series)
print(df)
print('---')
print(series.str.strip())
print(series.str.lstrip())
print(series.str.rstrip())
df.columns = df.columns.str.strip()
print(df)
# 字符串替换
df = pd.DataFrame(np.random.randn(3, 2), columns=[' Column A ', ' Column B '])
df.columns = df.columns.str.replace(' ', '-')
print(df)
# 限制替换次数
df.columns = df.columns.str.replace('-', 'hehe', n=1)
print(df)
# 字符串分割
series = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
print(series.str.split(','))
print('---')
# 访问分割结果
print(series.str.split(',')[0])
print('---')
# 使用get或[]访问元素
print(series.str.split(',').str[0])
print(series.str.split(',').str.get(1))
print('---')
# 扩展为DataFrame
print(series.str.split(',', expand=True))
print(series.str.split(',', expand=True, n=1))
print(series.str.rsplit(',', expand=True, n=1))
print('---')
# DataFrame中的split
df = pd.DataFrame({
'col1': ['a,b,c', '1,2,3', [':,., ']],
'col2': ['a-b-c', '1-2-3', [':-.- ']]
})
print(df['col2'].str.split('-'))
# 字符串索引
series = pd.Series(['A', 'b', 'C', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({
'col1': list('abcdef'),
'col2': ['hee', 'fv', 'w', 'hija', '123', np.nan]
})
print(series.str[0])
print(series.str[:2])
print(df['col2'].str[0])
三、数据合并操作
Pandas提供类似SQL的合并功能,支持高性能内存连接。
1. merge合并
# merge基本用法
left = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
# 单键合并
print(pd.merge(left, right, on='key'))
# 多键合并
df1 = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
df2 = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
print(pd.merge(df1, df2, on=['key1', 'key2']))
# 合并方式:inner/outer/left/right
print(pd.merge(df1, df2, on=['key1', 'key2'], how='inner'))
print('---')
print(pd.merge(df1, df2, on=['key1', 'key2'], how='outer'))
print('---')
print(pd.merge(df1, df2, on=['key1', 'key2'], how='left'))
print('---')
print(pd.merge(df1, df2, on=['key1', 'key2'], how='right'))
# 使用不同键名
df1 = pd.DataFrame({'lkey': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'rkey': list('abd'), 'date2': range(3)})
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))
# 键与索引组合
df1 = pd.DataFrame({'key': list('abcdfeg'), 'data1': range(7)})
df2 = pd.DataFrame({'date2': range(100, 105)}, index=list('abcde'))
print(pd.merge(df1, df2, left_on='key', right_index=True))
# 排序控制
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'), 'date2': [11, 2, 33]})
x1 = pd.merge(df1, df2, on='key', how='outer')
x2 = pd.merge(df1, df2, on='key', sort=True, how='outer')
print(x1)
print(x2)
print('---')
print(x2.sort_values('data1'))
2. join索引连接
# join方法:基于索引连接
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']
}, index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']
}, index=['K0', 'K2', 'K3'])
print(left)
print(right)
print(left.join(right))
print(left.join(right, how='outer'))
# 带键连接
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': [1, 3, 2, 4, 5, 9, 7]})
df2 = pd.DataFrame({'key': list('abd'), 'date2': [11, 2, 33]})
print(df1)
print(df2)
print(pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_1', '_2')))
print(df1.join(df2['date2']))
print('---')
# 指定连接键
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']
})
right = pd.DataFrame({
'C': ['C0', 'C1'],
'D': ['D0', 'D1']
}, index=['K0', 'K1'])
print(left)
print(right)
print(left.join(right, on='key'))
四、连接与填充
# 1. concat连接
s1 = pd.Series([1, 2, 3])
s2 = pd.Series([2, 3, 4])
s3 = pd.Series([1, 2, 3], index=['a', 'c', 'h'])
s4 = pd.Series([2, 3, 4], index=['b', 'e', 'd'])
print(pd.concat([s1, s2]))
print(pd.concat([s3, s4]).sort_index())
# 按列连接
print(pd.concat([s3, s4], axis=1))
# 2. 连接方式控制
s5 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s6 = pd.Series([2, 3, 4], index=['b', 'c', 'd'])
print(pd.concat([s5, s6], axis=1))
print(pd.concat([s5, s6], axis=1, join='inner'))
print(pd.concat([s5, s6], axis=1, join_axes=[['a', 'b', 'd']]))
# 3. 设置键名
result = pd.concat([s5, s6], keys=['one', 'two'])
print(result, type(result))
print(result.index)
result = pd.concat([s5, s6], axis=1, keys=['one', 'two'])
print(result, type(result))
# 4. combine_first填充
df1 = pd.DataFrame([
[np.nan, 3., 5.],
[-4.6, np.nan, np.nan],
[np.nan, 7., np.nan]
])
df2 = pd.DataFrame([
[-42.6, np.nan, -8.2],
[-5., 1.6, 4]
], index=[1, 2])
print(df1)
print(df2)
print(df1.combine_first(df2))
# update直接覆盖
df1.update(df2)
print(df1)
五、去重与替换
# 1. 去重操作
series = pd.Series([1, 1, 1, 1, 2, 2, 2, 3, 4, 5, 5, 5, 5])
print(series.duplicated())
print(series[series.duplicated() == False])
# drop_duplicates移除重复
result = series.drop_duplicates()
print(result)
# DataFrame中的去重
df = pd.DataFrame({
'col1': ['a', 'a', 3, 4, 5],
'col2': ['a', 'a', 'b', 'b', 'c']
})
print(df.duplicated())
print(df['col2'].duplicated())
# 2. 替换操作
series = pd.Series(list('ascaazsd'))
print(series.replace('a', np.nan))
print(series.replace(['a', 's'], np.nan))
print(series.replace({'a': 'hello world!', 's': 123}))
六、数据分组
1. 基本分组
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)
})
print(df)
print('=' * 60)
# groupby返回中间对象
print(df.groupby('A'), type(df.groupby('A')))
print('=' * 60)
# 分组后计算
a = df.groupby('A').mean(numeric_only=True)
b = df.groupby(['A', 'B']).mean()
c = df.groupby(['A'])['D'].mean()
print(a, type(a), '\n', a.columns)
print(b, type(b), '\n', b.columns)
print(c, type(c))
2. 可迭代分组
df = pd.DataFrame({
'X': ['A', 'B', 'A', 'B'],
'Y': [1, 4, 3, 2]
})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
# 迭代分组
print(list(df.groupby('X')))
print(list(df.groupby('X'))[0])
for n, g in df.groupby('X'):
print(n)
print(g)
print('###')
# 提取特定组
print(df.groupby(['X']).get_group('A'))
# groups属性
grouped = df.groupby(['X'])
print(grouped.groups)
print(grouped.groups['A'])
# 分组长度
print(grouped.size(), type(grouped.size()))
print('---')
# 多列分组
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)
})
grouped = df.groupby(['A', 'B']).groups
print(df)
print(grouped)
print(grouped[('foo', 'three')])
3. 按轴分组
df = pd.DataFrame({
'data1': np.random.rand(2),
'data2': np.random.rand(2),
'key1': ['a', 'b'],
'key2': ['one', 'two']
})
print(df)
print(df.dtypes)
print('---')
for n, p in df.groupby(df.dtypes, axis=1):
print(n)
print(p)
print('##')
4. 字典或Series分组
df = pd.DataFrame(np.arange(16).reshape(4, 4), columns=['a', 'b', 'c', 'd'])
print(df)
# 字典分组
mapping = {'a': 'one', 'b': 'one', 'c': 'two', 'd': 'two', 'e': 'three'}
grouped = df.groupby(mapping, axis=1)
print(grouped.sum())
# Series分组
s = pd.Series(mapping)
print(s)
print(s.groupby(s).count())
5. 函数分组
df = pd.DataFrame(
np.arange(16).reshape(4, 4),
columns=['a', 'b', 'c', 'd'],
index=['abc', 'bcd', 'aa', 'b']
)
print(df)
print(df.groupby(len).sum())
6. 分组计算函数
series = pd.Series([1, 2, 3, 10, 20, 30], index=[1, 2, 3, 1, 2, 3])
grouped = series.groupby(level=0)
print(grouped)
print(grouped.first(), '→ first:非NaN的第一个值')
print(grouped.last(), '→ last:非NaN的最后一个值')
print(grouped.sum(), '→ sum:非NaN的和')
print(grouped.mean(), '→ mean:非NaN的平均值')
print(grouped.median(), '→ median:非NaN的算术中位数')
print(grouped.count(), '→ count:非NaN的值')
print(grouped.min(), '→ min、max:非NaN的最小值、最大值')
print(grouped.std(), '→ std,var:非NaN的标准差和方差')
print(grouped.prod(), '→ prod:非NaN的积')
七、多函数计算
df = pd.DataFrame({
'a': [1, 1, 2, 2],
'b': np.random.rand(4),
'c': np.random.rand(4),
'd': np.random.rand(4)
})
print(df)
# 使用agg应用多个函数
print(df.groupby('a').agg(['mean', np.sum]))
print(df.groupby('a')['b'].agg([('B', np.mean), ('C', np.sum)]))
八、转换与分组应用
1. transform转换
df = pd.DataFrame({
'data1': np.random.rand(5),
'data2': np.random.rand(5),
'key1': list('aabba'),
'key2': ['one', 'two', 'one', 'two', 'one']
})
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)
# 合并分组均值
print(pd.merge(df, k_mean, left_on='key1', right_index=True).add_prefix('mean_'))
# 计算分组均值
print(df.groupby('key2').mean(numeric_only=True))
2. apply通用方法
df = pd.DataFrame({
'data1': np.random.rand(5),
'data2': np.random.rand(5),
'key1': list('aabba'),
'key2': ['one', 'two', 'one', 'two', 'one']
})
# apply应用函数
print(df.groupby('key1').apply(lambda x: x.describe()))
# 自定义函数
def select_head(d, n):
return d.sort_index()[:n]
def select_column(d, col):
return d[col]
print(df.groupby('key1').apply(select_head, 2))
print(df.groupby('key1').apply(select_column, 'data2'))
print(type(df.groupby('key1').apply(select_column, 'data2')))
九、透视表与交叉表
1. 透视表
# pivot_table基本用法
dates = ['2017-5-1', '2017-5-2', '2017-5-3'] * 3
rng = pd.to_datetime(dates)
df = pd.DataFrame({
'date': rng,
'key': list('abcdabcda'),
'values': np.random.rand(9) * 10
})
print(df)
# 创建透视表
print(pd.pivot_table(
df,
values='values',
index='date',
columns='key',
aggfunc=np.sum
))
# 多键透视
print(pd.pivot_table(
df,
values='values',
index=['date', 'key'],
aggfunc=len
))
2. 交叉表
df = pd.DataFrame({
'A': [1, 2, 2, 2, 2],
'B': [3, 3, 4, 4, 4],
'C': [1, 1, np.nan, 1, 1]
})
# 基本交叉表
print(pd.crosstab(df['A'], df['B']))
# 归一化
print(pd.crosstab(df['A'], df['B'], normalize=True))
# 带聚合值的交叉表
print(pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum))
# 添加统计边距
print(pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, margins=True))
十、数据读取
1. 读取分隔文本
import os
os.chdir('./data')
# read_table读取文本文件
data1 = pd.read_table('data1.txt', delimiter=',', header=0, index_col=1)
print(data1)
2. 读取CSV
# read_csv读取CSV文件
data2 = pd.read_csv('data2.csv', engine='python')
print(data2.head())
3. 读取Excel
# read_excel读取Excel文件
data3 = pd.read_excel(
'sample_data.xlsx',
sheetname='Sheet1',
header=0
)
print(data3)
十一、数据导出
Pandas支持多种数据导出格式,包括CSV、Excel、SQL等。