直接上代码,功能的实现说明在注释上😎
1.完整代码
import os
import pandas as pd
import numpy as np
import math
# Collect the names of the input files.
def get_data_set(path):
    """Return the sorted names of all entries in ``<path>/data``.

    Args:
        path: Directory that contains a ``data`` sub-directory.

    Returns:
        A list of entry names (not full paths) in ascending order.

    Raises:
        OSError: If ``<path>/data`` is missing or cannot be listed.
    """
    data_directory = os.path.join(path, "data")
    return sorted(os.listdir(data_directory))
def data_read(path, file_name):
    """Load one file from ``<path>/data`` into a DataFrame.

    ``.csv`` files are parsed as comma-separated and ``.txt`` files as
    tab-separated, both without a header row. GBK decoding is tried first
    and UTF-8 is used as the fallback. Every other extension is handed to
    ``pandas.read_excel``.

    Args:
        path: Directory that contains the ``data`` sub-directory.
        file_name: Name of the file inside ``<path>/data``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    data_file_path = os.path.join(path, "data", file_name)
    # splitext copes with names that contain several dots or none at all;
    # unpacking split(".") into exactly two names raises ValueError there.
    extension = os.path.splitext(file_name)[1].lstrip(".").lower()
    separators = {"csv": ",", "txt": "\t"}
    if extension not in separators:
        return pd.read_excel(data_file_path)

    sep = separators[extension]
    try:
        return pd.read_csv(data_file_path, encoding="gbk", header=None, sep=sep)
    except (UnicodeError, pd.errors.ParserError):
        # Only a decoding/parsing failure justifies the second attempt;
        # anything else (missing file, interrupt) must reach the caller.
        return pd.read_csv(data_file_path, encoding="utf-8", header=None, sep=sep)
def merge(path):
    """Merge every file in ``<path>/data`` side by side on its first column.

    Files are processed in the order returned by ``get_data_set``. The first
    column of each file is used as the row ID. The first file is the starting
    table; each later file is joined onto the accumulated table in turn.
    Only rows whose ID also occurs in the next file are kept, and the next
    file's remaining columns are appended on the right. The ID column is
    kept once, and appended columns are numbered consecutively after the
    existing ones. The shape is printed after every join.

    Args:
        path: Directory that contains the ``data`` sub-directory.

    Returns:
        The merged ``pandas.DataFrame``. With a single input file that file
        is returned as read.

    Raises:
        IndexError: If ``<path>/data`` contains no files.
        ValueError: If an ID that has to be matched occurs more than once in
            the file being appended, which makes the join ambiguous.
    """
    data_files = get_data_set(path)
    merged = data_read(path, data_files[0])
    for file_name in data_files[1:]:
        merged = _append_columns(merged, data_read(path, file_name), file_name)
        print (merged.shape)
    return merged


def _append_columns(left, right, source):
    """Return ``left`` restricted to IDs in ``right`` plus ``right``'s data columns."""
    left = left.set_index(left.columns[0])
    right = right.set_index(right.columns[0])

    # Column count of the accumulated table including its ID column; the
    # appended columns are labelled from here so numbering stays contiguous.
    first_new_label = left.shape[1] + 1

    # Boolean-mask filtering keeps the shared rows without adding a scratch
    # column that would otherwise end up in the merged output.
    left = left[left.index.isin(right.index)]

    matched = right[right.index.isin(left.index)]
    if not matched.index.is_unique:
        raise ValueError("duplicate IDs in %s make the merge ambiguous" % source)

    # Align the new columns to the surviving rows by ID, then switch both
    # sides to the same positional index so concat pairs them row by row.
    extra = matched.reindex(left.index).reset_index(drop=True)
    extra.columns = list(range(first_new_label, first_new_label + right.shape[1]))
    return pd.concat([left.reset_index(), extra], axis=1)
def main(path="E:/AnaLinReg/Data5/code_test/DataSet1"):
    """Merge the data set under ``path`` and print the resulting shape.

    Args:
        path: Directory that contains the ``data`` sub-directory. Defaults
            to the location the script was originally written for.
    """
    result = merge(path)
    print ("合并完成")
    print (result.shape)


if __name__ == "__main__":
    main()
待合并的文件:
d1.csv :69636行25列
d2.csv :69636行31列
d3.csv :69636行16列
d4.csv :69636行17列
d5.csv :69636行27列
合并期间,ID列(每个文件的第一列)是共用的列,合并结果中只保留一份。
2.算法运行结果:
點擊查看更多內容
為 TA 點贊
評論
評論
共同學習,寫下你的評論
評論加載中...
作者其他優質文章
正在加載中
感謝您的支持,我會繼續努力的~
掃碼打賞,你說多少就多少
贊賞金額會直接到老師賬戶
支付方式
打開微信掃一掃,即可進行掃碼打賞哦