2 回答

TA貢獻1860條經驗 獲得超8個贊
我的總體方法是合理的,但由于 Java 的 ZipFile 返回的信息不夠詳細而受到阻礙。例如,有時在下一個本地文件頭開始之前,壓縮數據的末尾會多出 16 個字節。ZipFile 中沒有任何內容可以幫助解決此問題。
zip4j 似乎是一個更好的選擇,并提供了以下方法: header.getOffsetLocalHeader()
消除了一些容易出錯的計算。

TA貢獻1785條經驗 獲得超8個贊
我還可以使用 zip4j 通過以下代碼使其工作。
但是我仍然不明白這個等式是如何推導出來的: long endFile = 30 + offset + header.getFileNameLength() + compressedSize - 1; 其中的 30 從哪里來?我如何確保該等式涵蓋所有用例所需的全部變量?
/**
 * Extracts a fixed set of entries from a ZIP archive stored in S3 without downloading the
 * whole archive.
 *
 * <p>Steps: read the object size, fetch the trailing 22-byte end-of-central-directory (EOCD)
 * record, fetch the central directory it points at, write "central directory + EOCD" to a
 * temporary file (with the directory offset patched to 0) so that a ZIP library can list the
 * entries, and finally hand that file to {@code getZipFile1} for each wanted entry.
 *
 * <p>Limitations: the archive must have an empty ZIP comment (otherwise the EOCD is not the
 * last 22 bytes) and must not be a ZIP64 archive.
 *
 * @param args unused
 * @throws Exception on any S3 or I/O failure, or if the archive layout is not supported
 */
public static void main(String[] args) throws Exception {
    // Size of the EOCD record when the archive comment is empty.
    final int eocdSize = 22;
    // Value stored in a 32-bit EOCD field when the real value lives in the ZIP64 EOCD record.
    final long zip64Sentinel = 0xFFFFFFFFL;

    // The client is AutoCloseable; try-with-resources releases its connection pool.
    try (S3Client s3Client = S3Client.builder()
            .credentialsProvider(StaticCredentialsProvider
                    .create(AwsSessionCredentials.create(ACCESS_KEY, SECRET_KEY, SESSION_TOKEN)))
            .region(Region.US_WEST_2)
            .build()) {

        HeadObjectResponse headObject = s3Client.headObject(HeadObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .build());
        long zipSize = headObject.contentLength();
        if (zipSize < eocdSize) {
            throw new IllegalStateException("Object is too small to be a ZIP archive: " + zipSize + " bytes");
        }

        // Fetch the last 22 bytes. HTTP byte ranges are inclusive on both ends, so the last
        // valid index is zipSize - 1.
        long eocdStart = zipSize - eocdSize;
        System.out.println("eocd start: " + eocdStart);
        byte[] eocd;
        try (var eocdStream = s3Client.getObject(GetObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .range("bytes=%d-%d".formatted(eocdStart, zipSize - 1))
                .build())) {
            eocd = IOUtils.toByteArray(eocdStream);
        }

        // The EOCD record starts with the signature "PK\5\6". If it is missing, the archive
        // most likely carries a comment and the record sits further back in the file.
        if (eocd.length != eocdSize
                || eocd[0] != 0x50 || eocd[1] != 0x4b || eocd[2] != 0x05 || eocd[3] != 0x06) {
            throw new IllegalStateException(
                    "End-of-central-directory record not found in the last " + eocdSize
                            + " bytes (non-empty ZIP comment?)");
        }

        // Both fields are unsigned 32-bit values; widen them so archives above 2 GiB do not
        // produce negative offsets.
        long cdSize = Integer.toUnsignedLong(byteArrayToLeInt(Arrays.copyOfRange(eocd, 12, 16)));
        long cdStart = Integer.toUnsignedLong(byteArrayToLeInt(Arrays.copyOfRange(eocd, 16, 20)));
        System.out.println("cdStart: " + cdStart);
        System.out.println("cdSize: " + cdSize);
        if (cdSize == zip64Sentinel || cdStart == zip64Sentinel) {
            throw new UnsupportedOperationException("ZIP64 archives are not supported");
        }

        // Fetch the full central directory.
        byte[] cd;
        try (var cdStream = s3Client.getObject(GetObjectRequest.builder()
                .bucket(BUCKET)
                .key(OBJECT_PATH)
                .range("bytes=%d-%d".formatted(cdStart, cdStart + cdSize - 1))
                .build())) {
            cd = IOUtils.toByteArray(cdStream);
        }

        // Build "central directory + EOCD". The directory offset inside the EOCD is reset to 0
        // because that is where the directory starts in the temporary file.
        System.arraycopy(leIntToByteArray(0), 0, eocd, 16, 4);
        ByteArrayOutputStream out = new ByteArrayOutputStream(cd.length + eocd.length);
        out.write(cd);
        out.write(eocd);
        byte[] cdbytes = out.toByteArray();
        System.out.println(cdbytes.length);

        File tempFile = Files.createTempFile("temp", ".zip").toFile();
        try {
            Files.write(tempFile.toPath(), cdbytes);

            String[] wantedEntries = {
                "a2ed09e5-dfdb-4a66-95f5-8bb62bc8fafd-2023-05-23T10_07_19Z.warc.gz",
                "index.cdx.gz",
                "index.cdx",
                "extraPages.jsonl",
                "pages.jsonl",
                "datapackage.json",
                "datapackage-digest.json",
            };
            for (String entryName : wantedEntries) {
                getZipFile1(s3Client, tempFile, entryName);
            }
        } finally {
            // The directory-only archive is scratch data; do not leave it behind.
            Files.deleteIfExists(tempFile.toPath());
        }
    }
}
/**
 * Downloads and extracts every non-directory entry whose name contains {@code file}.
 *
 * <p>The entry list is read from {@code tempFile}, which holds only the central directory of
 * the remote archive. For each match the entry's bytes are fetched from S3 with a ranged GET
 * and decompressed into {@code /home/joao/Downloads/folder/}.
 *
 * <p>Layout of one stored entry, starting at the local header offset:
 * <pre>
 *   30 bytes  fixed local file header (signature 4, version 2, flags 2, method 2, time 2,
 *             date 2, CRC 4, compressed size 4, uncompressed size 4, name length 2,
 *             extra length 2)
 *   n bytes   file name
 *   m bytes   extra field (may differ in length from the central directory's extra field)
 *   k bytes   compressed data
 *   0-24      optional data descriptor
 * </pre>
 * {@code n} and {@code m} are read from the local header itself, since the central directory
 * does not record the local extra-field length.
 *
 * @param s3Client client used for the ranged downloads
 * @param tempFile local file containing the archive's central directory and EOCD record
 * @param file substring to look for in entry names
 * @throws Exception on S3, ZIP or file-system failure
 */
private static void getZipFile1(S3Client s3Client, File tempFile, String file) throws Exception {
    // Size of the fixed part of a local file header.
    final int localHeaderFixedSize = 30;
    // Largest possible data descriptor (ZIP64 form with signature). Over-fetching is harmless:
    // the decompressor stops at the end of the entry.
    final int maxDataDescriptorSize = 24;
    final String targetFolder = "/home/joao/Downloads/folder/";

    try (ZipFile zipFile = new ZipFile(tempFile)) {
        for (var header : zipFile.getFileHeaders()) {
            if (header.isDirectory() || !header.getFileName().contains(file)) {
                continue;
            }
            System.out.println(header);
            long offset = header.getOffsetLocalHeader();

            // Read the fixed local header to learn the real name and extra-field lengths.
            byte[] localHeader;
            try (var headerStream = s3Client.getObject(GetObjectRequest.builder()
                    .bucket(BUCKET)
                    .key(OBJECT_PATH)
                    .range("bytes=%d-%d".formatted(offset, offset + localHeaderFixedSize - 1))
                    .build())) {
                localHeader = IOUtils.toByteArray(headerStream);
            }
            if (localHeader.length < localHeaderFixedSize) {
                throw new IllegalStateException(
                        "Truncated local file header for entry " + header.getFileName());
            }
            // Both lengths are little-endian unsigned 16-bit values.
            int nameLength = (localHeader[26] & 0xFF) | ((localHeader[27] & 0xFF) << 8);
            int extraLength = (localHeader[28] & 0xFF) | ((localHeader[29] & 0xFF) << 8);

            // Inclusive index of the last byte to fetch. Kept as long so that entries larger
            // than 2 GiB do not overflow. A range end past the end of the object is clamped
            // by the server.
            long endFile = offset + localHeaderFixedSize + nameLength + extraLength
                    + header.getCompressedSize() + maxDataDescriptorSize - 1;

            // Refuse entry names that would escape the target folder (e.g. via "..").
            File outputFile = new File(targetFolder + header.getFileName());
            var targetDir = new File(targetFolder).toPath().toAbsolutePath().normalize();
            if (!outputFile.toPath().toAbsolutePath().normalize().startsWith(targetDir)) {
                throw new SecurityException(
                        "Entry name escapes the target folder: " + header.getFileName());
            }
            Files.deleteIfExists(outputFile.toPath());

            // Stream straight from S3 through the decompressor to disk instead of buffering
            // the whole entry in memory.
            try (var entryStream = s3Client.getObject(GetObjectRequest.builder()
                            .bucket(BUCKET)
                            .key(OBJECT_PATH)
                            .range("bytes=%d-%d".formatted(offset, endFile))
                            .build());
                    ZipInputStream zipInputStream = new ZipInputStream(entryStream)) {
                // Passing the central-directory header gives the stream the sizes and CRC
                // even when the local header defers them to a data descriptor.
                zipInputStream.getNextEntry(header, true);
                FileUtils.copyInputStreamToFile(zipInputStream, outputFile);
            }
        }
    }
}
添加回答
舉報