#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-10-11 17:55:26
# @Author  : Sheldon (thisisscret@qq.com)
# @blogs   : 謝耳朵的派森筆記 (Sheldon's Python Notes)
# @Link    : https://www.cnblogs.com/shld/
import pandas as pd
from joblib import Parallel, cpu_count, delayed


def _chunk_slices(length, parts):
    """Return contiguous slices covering ``range(length)`` in roughly *parts* chunks.

    The chunk size is ``length / parts`` rounded half-up. *parts* is clamped
    to ``1..length`` so the step can never be zero or negative. A zero
    *length* yields the single empty slice ``slice(0, 0)``, so the caller
    still invokes its function once.
    """
    parts = max(1, min(parts, length))
    # max(1, ...) guards the length == 0 case, where the rounded size is 0.
    step = max(1, int(length / parts + 0.5))
    bounds = list(range(0, length, step)) or [0]
    bounds.append(length)
    return [slice(lo, hi) for lo, hi in zip(bounds, bounds[1:])]


def apply_parallel(df, func, n=-2):
    """Run *func* over row-chunks of *df* in parallel and concatenate the results.

    The DataFrame is split into contiguous row chunks; each chunk is handed
    to *func* through joblib's ``Parallel``/``delayed`` and the returned
    pieces are joined with ``pandas.concat``.

    @params df: the DataFrame to process.
    @params func: a user-defined function that takes a chunk of *df* and
        returns a pandas object, typically by calling ``.apply`` on the
        chunk. This is NOT the function that would be passed to ``.apply``.
    @params n: number of joblib workers. Negative values are relative to the
        CPU count: ``-1`` means all CPUs, and the default ``-2`` means all
        CPUs but one. ``None`` is treated as ``-1`` and ``0`` as ``1``.
    @return: the concatenation of the per-chunk results of *func*.
    """
    if n is None:
        n = -1
    # joblib rejects n_jobs=0, so 0 is treated as a single worker here.
    n_jobs = n or 1

    if n_jobs < 0:
        # Same convention as joblib: -1 -> all CPUs, -2 -> all but one, ...
        workers = cpu_count() + n_jobs + 1
    else:
        workers = n_jobs

    # _chunk_slices clamps the worker count, so short or empty frames and
    # very negative n no longer produce a zero or negative chunk step.
    slices = _chunk_slices(len(df), workers)

    # .iloc makes the row slicing positional whatever the index type is.
    results = Parallel(n_jobs=n_jobs)(
        delayed(func)(df.iloc[part]) for part in slices
    )
    return pd.concat(results)