from multiprocessing import cpu_count, Pool

import numpy as np
import pandas as pd
def parallelize(func, df):
    """Split data into one partition per CPU core and run ``func`` on each in parallel.

    https://www.machinelearningplus.com/python/parallel-processing-python/

    Parameters
    ----------
    df : pandas.DataFrame
        Data to be partitioned row-wise.
    func : callable
        Applied to each partition; must return a DataFrame and be picklable
        (i.e. defined at module top level) so it can be sent to workers.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-partition results, with the index reset.
    """
    cores = cpu_count()
    # Split by positional index rather than np.array_split(df, cores):
    # splitting a DataFrame directly goes through the deprecated
    # DataFrame.swapaxes path and can degrade partitions to ndarrays.
    index_chunks = np.array_split(np.arange(len(df)), cores)
    data_split = [df.iloc[chunk] for chunk in index_chunks]
    # Context manager guarantees the workers are cleaned up even when
    # func raises; the original leaked the pool on error because
    # close()/join() were never reached.
    with Pool(cores) as pool:
        data = pd.concat(pool.map(func, data_split), ignore_index=True)
    return data
如果不是 DataFrame 对象而是一个大的文件对象,对应的处理方法为:
from multiprocessing import cpu_count, Pool

# Map worker processes over the lines of a large file instead of a
# DataFrame.  `preprocess` must be defined at module top level so it can
# be pickled and shipped to the workers.
pool = Pool(processes=cpu_count())
with open('test.txt', 'r') as file:
    # preprocess() return values are collected into the list `rows`
    rows = pool.map(preprocess, file)
pool.close()
pool.join()  # reap the workers; the original never joined the pool
df = pd.DataFrame(rows,
                  columns=["timestamp", "cmdb_id", "parent_id",
                           "span_id", "trace_id", "duration"])