From 6f7276561b7c8196ea8c0e6f972e0fade8b97028 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 19 Apr 2024 15:54:39 +0900 Subject: [PATCH 1/4] Bump version to 0.6.2-SNAPSHOT --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 1e50272..4eec4f1 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 au.gov.nla httrack2warc - 0.6.1 + 0.6.2-SNAPSHOT UTF-8 From 04ca3e64240b6fced3ebc5f91e02158a1059bc2e Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2024 13:09:22 +0900 Subject: [PATCH 2/4] Write warcinfo records to the redirect WARC file too --- src/au/gov/nla/httrack2warc/Httrack2Warc.java | 4 ++++ src/au/gov/nla/httrack2warc/RedirectWriter.java | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/au/gov/nla/httrack2warc/Httrack2Warc.java b/src/au/gov/nla/httrack2warc/Httrack2Warc.java index 2e3a708..e1a0ae8 100644 --- a/src/au/gov/nla/httrack2warc/Httrack2Warc.java +++ b/src/au/gov/nla/httrack2warc/Httrack2Warc.java @@ -158,6 +158,10 @@ public void convertDirectory(Path sourceDirectory) throws IOException { Set processedFiles = new HashSet<>(); LinkRewriter linkRewriter = rewriteLinks ? new LinkRewriter(crawl) : null; + if (redirectWriter.warc != warc) { + redirectWriter.warc.writeWarcinfoRecord(UUID.randomUUID(), launchInstant, warcInfo); + } + crawl.forEach(record -> { if (isUrlExcluded(record.getUrl())) { log.info("Excluded {}", record.getUrl()); diff --git a/src/au/gov/nla/httrack2warc/RedirectWriter.java b/src/au/gov/nla/httrack2warc/RedirectWriter.java index aec54e6..76bed95 100644 --- a/src/au/gov/nla/httrack2warc/RedirectWriter.java +++ b/src/au/gov/nla/httrack2warc/RedirectWriter.java @@ -13,7 +13,7 @@ */ public class RedirectWriter implements Closeable { private final String prefix; - private final WarcWriter warc; + final WarcWriter warc; public RedirectWriter(String prefix, WarcWriter warc) { this.prefix = prefix; From 05c5f0ce1848b15650ec2825cd4e8482df6f8362 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2024 13:22:21 +0900 Subject: [PATCH 3/4] RedirectWriter: Fixup httrack URLs (to percent encode spaces etc) --- src/au/gov/nla/httrack2warc/RedirectWriter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/au/gov/nla/httrack2warc/RedirectWriter.java b/src/au/gov/nla/httrack2warc/RedirectWriter.java index 76bed95..4caae03 100644 --- a/src/au/gov/nla/httrack2warc/RedirectWriter.java +++ b/src/au/gov/nla/httrack2warc/RedirectWriter.java @@ -1,5 +1,6 @@ package au.gov.nla.httrack2warc; +import au.gov.nla.httrack2warc.httrack.HtsUtil; import au.gov.nla.httrack2warc.httrack.HttrackRecord; import java.io.ByteArrayInputStream; @@ -23,7 +24,7 @@ public RedirectWriter(String prefix, WarcWriter warc) { public void write(HttrackRecord record, Instant warcDate) throws IOException { // build synthetic redirect record if (prefix != null && record.getFilename() != null) { - String httrackUrl = prefix + record.getFilename(); + String httrackUrl = HtsUtil.fixupUrl(prefix + record.getFilename()); byte[] body = new byte[0]; String header = "HTTP/1.1 301 Moved Permanently\r\n" + "Location: " + record.getUrl() + "\r\n" + From 27e634a13238c77af4d27f9fd852112cc0178638 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2024 13:31:36 +0900 Subject: [PATCH 4/4] Fix test for properly encoded redirect records --- test/au/gov/nla/httrack2warc/Httrack2WarcTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/au/gov/nla/httrack2warc/Httrack2WarcTest.java b/test/au/gov/nla/httrack2warc/Httrack2WarcTest.java index d7711f1..0fe0c7f 100644 --- a/test/au/gov/nla/httrack2warc/Httrack2WarcTest.java +++ b/test/au/gov/nla/httrack2warc/Httrack2WarcTest.java @@ -51,11 +51,11 @@ public void test() throws IOException { StringBuilder summary = new StringBuilder(); try (WarcReader warcReader = new WarcReader(outdir.resolve("crawl-0.warc.gz"))) { for (WarcRecord warcRecord: warcReader) { - URI url = warcRecord instanceof WarcTargetRecord ? ((WarcTargetRecord) warcRecord).targetURI() : null; + String url = warcRecord instanceof WarcTargetRecord ? ((WarcTargetRecord) warcRecord).target() : null; summary.append(warcRecord.type()).append(" ").append(url).append("\n"); // HTTrack generates bad http requests containing a fragment which jwarc strictly rejects, just skip them - if (url != null && url.getFragment() != null) continue; + if (url != null && URI.create(url).getFragment() != null) continue; if (warcRecord instanceof WarcRequest) { assertEquals(MessageVersion.HTTP_1_1, ((WarcRequest) warcRecord).http().version()); @@ -87,9 +87,9 @@ public void test() throws IOException { "request http://test.example.org/redirect\n" + "metadata http://test.example.org/redirect\n" + "response http://prefix.example.org/test.example.org/redirect\n" + - "response http://test.example.org/page%2520WITH%2520%2522special%2522%2520chars.html\n" + - "request http://test.example.org/page%2520WITH%2520%2522special%2522%2520chars.html\n" + - "metadata http://test.example.org/page%2520WITH%2520%2522special%2522%2520chars.html\n" + + "response http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" + + "request http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" + + "metadata http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" + "response http://prefix.example.org/test.example.org/page%20WITH%20_special_%20chars.html\n" + "response http://test.example.org/image.gif\n" + "request http://test.example.org/image.gif\n" +