2 回答

TA貢獻1725條經驗 獲得超8個贊
$ cd ~
$ more aggregate.csv
X
X
X
X
X
X
$ more ./Desktop/folder/sub-folder/sample.csv
A,1
A,2
A,3
A,4
A,5
$ more ./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
B,6
B,7
B,8
B,9
$ more ./Desktop/folder/sub-folder2/sample.csv
C,10
C,11
C,12
C,13
C,14
C,15
C,16
$ more ./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
D,17
D,18
D,19
$ python3 aggregate_samples.py ./Desktop
./Desktop/folder/sub-folder/sample.csv
./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
./Desktop/folder/sub-folder2/sample.csv
./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
$ cat aggregate.csv
X,1,6,10,17
X,2,7,11,18
X,3,8,12,19
X,4,9,13,
X,5,,14,
X,,,15,
,,,16,
這是完成此操作的代碼。您需要的關鍵技術：用os.walk()遞歸搜索文件夾、用csv模塊讀取sample.csv文件（并獲取第二列）、用列表累積樣本，以及再次用csv模塊寫出結果。我假設您的sample.csv文件長度可能各不相同，因此代碼會處理這種情況（通過預先分配一個以空字符串填充的矩陣）。
這假設您的數據集足夠小以適合內存。如果沒有,那么需要做更多的工作。
# aggregate_samples.py
import os
import sys
import argparse
import csv
def main(options):
    """Append the 2nd column of every ``sample.csv`` found to ``aggregate.csv``.

    The directory tree rooted at ``options.directory`` is walked (sub-directories
    in sorted order). Each file named exactly ``sample.csv`` contributes one new
    column, made of the second field of each of its rows. The new columns are
    appended to the right of whatever ``aggregate.csv`` in the current working
    directory already contains; if that file does not exist it is created.
    Columns shorter than the tallest one are padded with empty strings.

    Args:
        options: Object with a ``directory`` attribute naming the root of the
            tree to search (e.g. the ``argparse`` namespace).

    Returns:
        0, intended for use as the process exit status.
    """
    # Parse the existing aggregate as CSV rows (not raw lines) so that the
    # columns written by a previous run stay separate columns on this run.
    try:
        with open('aggregate.csv', newline='') as f:
            rows = list(csv.reader(f))
    except FileNotFoundError:
        # Doesn't exist; it is created when the result is written below.
        rows = []

    # Square up the existing rows so every new column lands at the same index.
    width = max((len(row) for row in rows), default=0)
    rows = [row + [''] * (width - len(row)) for row in rows]

    columns = []
    for d, subdirs, files in os.walk(options.directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        subdirs.sort()
        for filename in files:
            if filename != 'sample.csv':
                continue
            file_path = os.path.join(d, filename)
            print(file_path)
            with open(file_path, newline='') as f:
                # Take the 2nd field of each row. Blank lines are skipped and a
                # row with a single field contributes an empty cell instead of
                # raising IndexError.
                columns.append([
                    row[1] if len(row) > 1 else ''
                    for row in csv.reader(f)
                    if row
                ])

    # The output must be tall enough for the longest column, *including* the
    # rows already present in aggregate.csv.
    height = max([len(rows)] + [len(column) for column in columns])
    missing_rows = height - len(rows)
    rows.extend([[''] * width for _ in range(missing_rows)])

    # Append each new column cell by cell, padding short columns with ''.
    for column in columns:
        padded = column + [''] * (height - len(column))
        for row, value in zip(rows, padded):
            row.append(value)

    # newline='' lets the csv module control line endings itself.
    with open('aggregate.csv', 'w', newline='') as aggregate:
        csv.writer(aggregate, delimiter=',').writerows(rows)

    return 0
if __name__ == '__main__':
    # Script entry point: one positional argument names the directory tree
    # to search; main()'s return value becomes the process exit status.
    parser = argparse.ArgumentParser()
    parser.add_argument('directory', help='Directory path.')
    options = parser.parse_args()
    exit_status = main(options)
    sys.exit(exit_status)
添加回答
舉報