Python 使用 matplotlib 画图

本贴最后更新于 2264 天前,其中的信息可能已经东海扬尘

数据获取

import pandas as pd
import urllib.request
import tempfile
import shutil
import zipfile
import matplotlib
import numpy as np
from matplotlib import pyplot as plt

# Download the Bike Sharing dataset into a scratch directory. The context
# manager removes the directory on every exit path, including failures.
data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
with tempfile.TemporaryDirectory() as temp_dir:
    zipname = temp_dir + '/Bike-Sharing-Dataset.zip'
    urllib.request.urlretrieve(data_source, zipname)

    # Unpack the archive; the `with` block closes it even if extraction fails.
    with zipfile.ZipFile(zipname, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Load the daily records before the scratch directory disappears.
    daily_path = temp_dir + '/day.csv'
    daily_data = pd.read_csv(daily_path)

# Convert the date strings into datetime values.
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])
# Columns we are not interested in.
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']
daily_data.drop(drop_list, inplace=True, axis=1)
**Attribute Information:**

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv

- instant: record index
- dteday : date
- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit :
    - 1: Clear, Few clouds, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- hum: Normalized humidity. The values are divided by 100 (max)
- windspeed: Normalized wind speed. The values are divided by 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

参数配置

# Figure size: 7" x 4"
matplotlib.rc('figure', figsize=(7, 4))
# Font size: 7
matplotlib.rc('font', size=7)
# Hide the top and right axis lines
matplotlib.rc('axes.spines', top=False, right=False)
# No grid, white plot background
matplotlib.rc('axes', grid=False, facecolor='white')

散点图

# 包装一个散点图的函数便于复用
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw a scatter plot of ``y_data`` against ``x_data`` on a new figure.

    Args:
        x_data: x coordinates of the points.
        y_data: y coordinates of the points.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    # Create a new figure with a single axes.
    _, ax = plt.subplots()

    # Plot the points: marker size 10, fixed colour, 75% opacity.
    # Colour reference: http://www.114la.com/other/rgb.htm
    ax.scatter(x_data, y_data, s=10, color='#539caf', alpha=0.75)

    # Title and axis labels.
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Draw the scatter plot
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)

76cccae51e484ac998be143b1fe75e15-image.png

曲线图

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Add a constant column so the linear regression has an intercept: y = k*x + b
x = sm.add_constant(daily_data['temp'])
y = daily_data['cnt']
# Ordinary least squares (OLS) model
regr = sm.OLS(y, x)
res = regr.fit()
# Pull the fitted data out of the model.
# alpha=0.05 is the significance level (i.e. 95% confidence intervals);
# st is the summary table, data the per-observation values, ss2 the column names.
st, data, ss2 = summary_table(res, alpha=0.05)
# Column 2 is used as the fitted values
# NOTE(review): column position follows summary_table's layout; confirm against statsmodels docs
fitted_values = data[:, 2]

# 包装曲线绘制函数
def lineplot(x_data, y_data, x_label, y_label, title):
    """Plot ``y_data`` against ``x_data`` as a single solid line on a new figure.

    Args:
        x_data: x coordinates of the line.
        y_data: y coordinates of the line.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    axes = plt.subplots()[1]

    # Fully opaque line, 2 points wide (lw = line width).
    axes.plot(x_data, y_data, lw=2, color='#539caf', alpha=1)

    # Title and axis labels.
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

# Draw the line of best fit
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

033f6563d5ac44f4a120c55bbc9d75f1-image.png

带置信区间的曲线图

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Add a constant column so the linear regression has an intercept: y = k*x + b
x = sm.add_constant(daily_data['temp'])
y = daily_data['cnt']
# Ordinary least squares (OLS) model
regr = sm.OLS(y, x)
res = regr.fit()
# Pull the fitted data out of the model.
# alpha=0.05 is the significance level (i.e. 95% confidence intervals);
# st is the summary table, data the per-observation values, ss2 the column names.
st, data, ss2 = summary_table(res, alpha=0.05)
# Column 2 is used as the fitted values
# NOTE(review): column position follows summary_table's layout; confirm against statsmodels docs
fitted_values = data[:, 2]

# Lower and upper bounds of the confidence interval (columns 4 and 5 of the table)
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4], data[:, 5]

# Collect x values and both bounds in one DataFrame, ordered by x_data
CI_df = pd.DataFrame({
    'x_data': daily_data['temp'],
    'low_CI': predict_mean_ci_low,
    'upper_CI': predict_mean_ci_upp,
})
CI_df.sort_values('x_data', inplace=True)

# 绘制置信区间
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Plot a line together with a shaded band between two bounds.

    Args:
        x_data: x coordinates of the line.
        y_data: y coordinates of the line.
        sorted_x: x coordinates of the shaded band.
        low_CI: lower edge of the band at each ``sorted_x``.
        upper_CI: upper edge of the band at each ``sorted_x``.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    axes = plt.subplots()[1]

    # The prediction line.
    axes.plot(x_data, y_data, lw=1, color='#539caf', alpha=1, label='Fit')
    # The band, filled between the lower and upper bounds along sorted_x.
    axes.fill_between(sorted_x, low_CI, upper_CI, color='#539caf', alpha=0.4, label='95% CI')

    # Title and axis labels.
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

    # Legend built from the label= arguments; 'best' lets matplotlib pick the spot.
    axes.legend(loc='best')

# Draw the fit with its confidence band
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

a15b18809de84a23bc2d7c257845f7dd-image.png

双坐标曲线图

# 双纵坐标绘图函数
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Plot two series against the same x values, each with its own y-axis.

    Args:
        x_data: shared x coordinates.
        x_label: text for the x-axis label.
        y1_data: series drawn against the left y-axis.
        y1_color: colour of the first line and of the left axis label.
        y1_label: text for the left y-axis label.
        y2_data: series drawn against the right y-axis.
        y2_color: colour of the second line and of the right axis label.
        y2_label: text for the right y-axis label.
        title: text for the axes title.
    """
    left_axes = plt.subplots()[1]
    left_axes.plot(x_data, y1_data, color=y1_color)
    # Title and labels for the left-hand axes.
    left_axes.set_ylabel(y1_label, color=y1_color)
    left_axes.set_xlabel(x_label)
    left_axes.set_title(title)

    # Second axes created with twinx() so both share the x-axis.
    right_axes = left_axes.twinx()
    right_axes.plot(x_data, y2_data, color=y2_color)
    right_axes.set_ylabel(y2_label, color=y2_color)
    # Make the right spine visible for the second y-axis.
    right_axes.spines['right'].set_visible(True)

# Draw check-outs and wind speed over time on two y-axes
lineplot2y(
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_color='#539caf',
    y1_label='Check outs',
    y2_data=daily_data['windspeed'],
    y2_color='#7663b0',
    y2_label='Normalized windspeed',
    title='Check Outs and Windspeed Over Time',
)

903b8397a0774ccb9278fc29cca449bc-image.png

直方图

# 绘制直方图的函数
def histogram(data, x_label, y_label, title):
    """Draw a 10-bin histogram of ``data`` on a new figure.

    Args:
        data: values to bin.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    axes = plt.subplots()[1]
    # Fixed number of bins.
    axes.hist(data, color='#539caf', bins=10)
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)

# Draw the histogram of registered check-outs
histogram(
    data=daily_data['registered'],
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)

eca194ebfcb34dc8b566673486df8b40-image.png

堆叠直方图

# 绘制堆叠的直方图
def overlaid_historgram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title):
    """Overlay the histograms of two data sets using one shared set of bins.

    Args:
        data1: first data set (drawn fully opaque).
        data1_name: legend label for ``data1``.
        data1_color: bar colour for ``data1``.
        data2: second data set (drawn on top at 75% opacity).
        data2_name: legend label for ``data2``.
        data2_color: bar colour for ``data2``.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    # Both histograms use the same edges, spanning the combined data range.
    max_nbins = 10
    lowest = min(np.min(data1), np.min(data2))
    highest = max(np.max(data1), np.max(data2))
    # linspace always returns exactly max_nbins + 1 edges ending on `highest`;
    # a float-step arange can round to one edge too many.
    bins = np.linspace(lowest, highest, max_nbins + 1)

    # Create the figure and draw both histograms.
    _, ax = plt.subplots()
    ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
    ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='best')

# Draw the overlaid histograms of registered and casual check-outs
overlaid_historgram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Check Outs By Type',
)

66805fce28b34edda1deb938e257f799-image.png

密度估计曲线

# Estimate the probability density of registered check-outs
from scipy.stats import gaussian_kde
data = daily_data['registered']
# Kernel density estimate: https://en.wikipedia.org/wiki/Kernel_density_estimation
# bw_method sets the smoothing factor through the public API;
# the larger the value, the smoother the curve.
density_est = gaussian_kde(data, bw_method=.3)
# Points at which the density is evaluated: min to max in steps of 200
x_data = np.arange(min(data), max(data), 200)

# 绘制密度估计曲线
def densityplot(x_data, density_est, x_label, y_label, title):
    """Plot ``density_est`` evaluated at ``x_data`` as a line on a new figure.

    Args:
        x_data: points at which the estimator is evaluated.
        density_est: callable returning the density for ``x_data``.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    axes = plt.subplots()[1]
    density_values = density_est(x_data)
    axes.plot(x_data, density_values, color='#539caf', lw=2)
    axes.set_ylabel(y_label)
    axes.set_xlabel(x_label)
    axes.set_title(title)

# Draw the density estimate
densityplot(
    x_data=x_data,
    density_est=density_est,
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)

edbe7de1e00740158b670982c060927d-image.png

柱状图

# Mean and standard deviation of total check-outs for each day of the week.
# String aggregator names keep pandas' own mean/std and avoid the deprecated
# callable form; selecting 'cnt' first yields flat 'mean' / 'std' columns.
mean_total_co_day = daily_data.groupby('weekday')['cnt'].agg(['mean', 'std'])

# 定义绘制柱状图的函数
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw a bar chart with vertical error bars on a new figure.

    Args:
        x_data: x positions of the bars.
        y_data: bar heights.
        error_data: error-bar half-lengths, passed to ``yerr``.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    _, ax = plt.subplots()
    # The bars, centred on their x positions.
    ax.bar(x_data, y_data, color='#539caf', align='center')
    # The error bars; ls='none' suppresses the line that would otherwise
    # connect the points from bar to bar.
    ax.errorbar(x_data, y_data, yerr=error_data, color='#297083', ls='none', lw=5)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

# Draw the mean check-outs per weekday with error bars
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day Of Week (0 = Sunday)',
)

25dab497aafd4a1892e5bb0c1c2a3a5f-image.png

堆积柱状图

# Average registered and casual check-outs for each day of the week
mean_by_reg_co_day = daily_data[['weekday', 'registered', 'casual']].groupby('weekday').mean()
# Share of each user type in the daily total
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'].add(mean_by_reg_co_day['casual'])
mean_by_reg_co_day['reg_prop'] = mean_by_reg_co_day['registered'].div(mean_by_reg_co_day['total'])
mean_by_reg_co_day['casual_prop'] = mean_by_reg_co_day['casual'].div(mean_by_reg_co_day['total'])

# 绘制堆积柱状图
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw a stacked bar chart on a new figure.

    Args:
        x_data: x positions of the bars.
        y_data_list: one sequence of bar heights per category, bottom first.
        y_data_names: legend label for each category.
        colors: bar colour for each category.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    _, ax = plt.subplots()
    # Running height of the stack; each new category starts where the
    # categories below it end, so any number of categories stacks correctly.
    stack_top = None
    for y_data, name, color in zip(y_data_list, y_data_names, colors):
        heights = np.asarray(y_data, dtype=float)
        if stack_top is None:
            ax.bar(x_data, heights, color=color, align='center', label=name)
            stack_top = heights
        else:
            ax.bar(x_data, heights, color=color, bottom=stack_top, align='center', label=name)
            stack_top = stack_top + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    # Fixed legend position.
    ax.legend(loc='upper right')

# Draw the stacked proportions of registered and casual check-outs
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Proportion of check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

e947382ba99048bba1d5181b5b167fef-image.png

分组柱状图

# Average registered and casual check-outs for each day of the week
mean_by_reg_co_day = daily_data[['weekday', 'registered', 'casual']].groupby('weekday').mean()
# Share of each user type in the daily total
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'].add(mean_by_reg_co_day['casual'])
mean_by_reg_co_day['reg_prop'] = mean_by_reg_co_day['registered'].div(mean_by_reg_co_day['total'])
mean_by_reg_co_day['casual_prop'] = mean_by_reg_co_day['casual'].div(mean_by_reg_co_day['total'])

# 绘制分组柱状图的函数
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw a grouped (side-by-side) bar chart on a new figure.

    Args:
        x_data: numeric x positions of the group centres.
        y_data_list: one sequence of bar heights per category.
        y_data_names: legend label for each category.
        colors: bar colour for each category.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    _, ax = plt.subplots()
    # Total width taken up by one group of bars.
    total_width = 0.8
    # Width of a single bar within the group.
    n_series = len(y_data_list)
    ind_width = total_width / n_series
    # Centre offset of each bar relative to the group centre, symmetric
    # around 0. Built from integer steps so there is exactly one per series.
    alteration = (np.arange(n_series) - (n_series - 1) / 2) * ind_width

    # Draw each category shifted sideways by its offset.
    group_centres = np.asarray(x_data)
    for i in range(n_series):
        ax.bar(group_centres + alteration[i], y_data_list[i], color=colors[i],
               label=y_data_names[i], width=ind_width)

    # Labels, title and legend are set once, after all bars are drawn.
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')

# Draw registered and casual check-outs side by side for each weekday
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

1772b8b52f8a471eb89c479011019e8d-image.png

箱式图

# Split the total check-outs by weekday: one array per distinct weekday value,
# in the same order as `days`.
days = np.unique(daily_data['weekday'])
bp_data = [daily_data.loc[daily_data['weekday'] == day, 'cnt'].values for day in days]

# 定义绘图函数
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of ``y_data`` on a new figure.

    Args:
        x_data: tick labels, one per box.
        y_data: sequence of data sets, one per box.
        base_color: colour of the median line, box outline and whisker caps.
        median_color: colour of the box fill and the whiskers.
        x_label: text for the x-axis label.
        y_label: text for the y-axis label.
        title: text for the axes title.
    """
    _, ax = plt.subplots()

    # NOTE(review): the median line uses base_color while the box fill uses
    # median_color, which looks swapped relative to the parameter names.
    # The mapping is kept as written; confirm the intent.
    ax.boxplot(
        y_data,
        # Fill the boxes with colour.
        patch_artist=True,
        # Median line colour.
        medianprops={'color': base_color},
        # Box colours: 'color' is the outline, 'facecolor' the fill.
        boxprops={'color': base_color, 'facecolor': median_color},
        # Whisker colour.
        whiskerprops={'color': median_color},
        # Whisker cap colour.
        capprops={'color': base_color},
    )

    # Label the boxes with the values from x_data.
    ax.set_xticklabels(x_data)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

# Draw the box plot of total check-outs per weekday
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)

81a3e0b31db04d2eb72c18cf36af3142-image.png

来源

@ 寒小阳

  • B3log

    B3log 是一个开源组织,名字来源于“Bulletin Board Blog”缩写,目标是将独立博客与论坛结合,形成一种新的网络社区体验,详细请看 B3log 构思。目前 B3log 已经开源了多款产品:Sym、Solo、Vditor、思源笔记

    1090 引用 • 3467 回帖 • 297 关注
  • Python

    Python 是一种面向对象、直译式电脑编程语言,具有近二十年的发展历史,成熟且稳定。它包含了一组完善而且容易理解的标准库,能够轻松完成很多常见的任务。它的语法简捷和清晰,尽量使用无异义的英语单词,与其它大多数程序设计语言使用大括号不一样,它使用缩进来定义语句块。

    534 引用 • 671 回帖

相关帖子

欢迎来到这里!

我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。

注册 关于
请输入回帖内容 ...