diff --git a/.gitignore b/.gitignore
index 54f03288..14859f0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@ src/main/python/tf/model.zip
src/main/python/tf/util/spark.conf
src/main/python/tf/model/graph/
src/main/python/tf/model/category/
+.bloop/
+.metals/
diff --git a/README.md b/README.md
index ce0f940e..5a1b1965 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
# The Archives Unleashed Toolkit
[](https://codecov.io/gh/archivesunleashed/aut)
[](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut)
-[](http://api.docs.archivesunleashed.io/0.80.0/apidocs/index.html)
-[](http://api.docs.archivesunleashed.io/0.80.0/scaladocs/index.html)
-[](https://aut.docs.archivesunleashed.org/docs/home)
+[](http://api.docs.archivesunleashed.io/0.90.0/apidocs/index.html)
+[](http://api.docs.archivesunleashed.io/0.90.0/scaladocs/index.html)
+[](https://aut.docs.archivesunleashed.org/docs/home)
[](https://www.apache.org/licenses/LICENSE-2.0)
[](./CONTRIBUTING.md)
diff --git a/pom.xml b/pom.xml
index 2247e544..63aa7533 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
io.archivesunleashed
aut
jar
- 0.90.0
+ 0.90.1
Archives Unleashed Toolkit
An open-source toolkit for analyzing web archives.
https://github.com/archivesunleashed/aut
@@ -58,7 +58,7 @@
scm:git:git@github.com:archivesunleashed/aut.git
scm:git:git@github.com:archivesunleashed/aut.git
git@github.com:archivesunleashed/aut.git
- aut-0.90.0
+ aut-0.90.1
@@ -469,7 +469,7 @@
org.jsoup
jsoup
- 1.7.3
+ 1.13.1
org.netpreserve.commons
diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala
index c162b51a..2d269082 100644
--- a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala
@@ -34,6 +34,7 @@ object AudioInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala
index aa968459..d4ab80fa 100644
--- a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala
@@ -34,6 +34,7 @@ object ImageInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala
index ea51374b..2d105c95 100644
--- a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala
@@ -34,6 +34,7 @@ object PDFInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala
index 98594fd6..0db5868c 100644
--- a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala
@@ -34,6 +34,7 @@ object PresentationProgramInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala
index e303add0..1ca25ac7 100644
--- a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala
@@ -34,6 +34,7 @@ object SpreadsheetInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala
index 1cd3e392..f0839195 100644
--- a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala
@@ -34,6 +34,7 @@ object VideoInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala
index a314a7cb..a8424741 100644
--- a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala
+++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala
@@ -34,6 +34,7 @@ object WordProcessorInformationExtractor {
import spark.implicits._
// scalastyle:on
d.select(
+ $"crawl_date",
$"url",
$"filename",
$"extension",
diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
index b579b905..b7dd22bf 100644
--- a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
@@ -45,13 +45,14 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/files/feniz.mp3")
- assert(dfResults(0).get(1) == "feniz.mp3")
- assert(dfResults(0).get(2) == "mp3")
- assert(dfResults(0).get(3) == "audio/mpeg")
+ assert(dfResults(0).get(0) == "20190817")
+ assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3")
+ assert(dfResults(0).get(2) == "feniz.mp3")
+ assert(dfResults(0).get(3) == "mp3")
assert(dfResults(0).get(4) == "audio/mpeg")
- assert(dfResults(0).get(5) == "f7e7ec84b12c294e19af1ba41732c733")
- assert(dfResults(0).get(6) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a")
+ assert(dfResults(0).get(5) == "audio/mpeg")
+ assert(dfResults(0).get(6) == "f7e7ec84b12c294e19af1ba41732c733")
+ assert(dfResults(0).get(7) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala
index 829cbc5d..a414cf20 100644
--- a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala
@@ -44,15 +44,16 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 55
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "http://www.archive.org/images/logoc.jpg")
- assert(dfResults(0).get(1) == "logoc.jpg")
- assert(dfResults(0).get(2) == "jpg")
- assert(dfResults(0).get(3) == "image/jpeg")
+ assert(dfResults(0).get(0) == "20080430")
+ assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg")
+ assert(dfResults(0).get(2) == "logoc.jpg")
+ assert(dfResults(0).get(3) == "jpg")
assert(dfResults(0).get(4) == "image/jpeg")
- assert(dfResults(0).get(5) == 70)
- assert(dfResults(0).get(6) == 56)
- assert(dfResults(0).get(7) == "8211d1fbb9b03d8522a1ae378f9d1b24")
- assert(dfResults(0).get(8) == "a671e68fc211ee4996a91e99297f246b2c5faa1a")
+ assert(dfResults(0).get(5) == "image/jpeg")
+ assert(dfResults(0).get(6) == 70)
+ assert(dfResults(0).get(7) == 56)
+ assert(dfResults(0).get(8) == "8211d1fbb9b03d8522a1ae378f9d1b24")
+ assert(dfResults(0).get(9) == "a671e68fc211ee4996a91e99297f246b2c5faa1a")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
index ab88280c..3690c8a0 100644
--- a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
@@ -45,17 +45,18 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2
assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190812")
assert(
dfResults(0).get(
- 0
+ 1
) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y"
)
- assert(dfResults(0).get(1) == "cost-analysis.pdf")
- assert(dfResults(0).get(2) == "pdf")
- assert(dfResults(0).get(3) == "application/pdf")
+ assert(dfResults(0).get(2) == "cost-analysis.pdf")
+ assert(dfResults(0).get(3) == "pdf")
assert(dfResults(0).get(4) == "application/pdf")
- assert(dfResults(0).get(5) == "aaba59d2287afd40c996488a39bbc0dd")
- assert(dfResults(0).get(6) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
+ assert(dfResults(0).get(5) == "application/pdf")
+ assert(dfResults(0).get(6) == "aaba59d2287afd40c996488a39bbc0dd")
+ assert(dfResults(0).get(7) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
index 6bdfee35..5daab598 100644
--- a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
@@ -47,25 +47,26 @@ class PresentationProgramInformationExtractorTest
val RESULTSLENGTH = 2
assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
assert(
dfResults(0).get(
- 0
+ 1
) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx"
)
- assert(dfResults(0).get(1) == "aut-test-fixtures.pptx")
- assert(dfResults(0).get(2) == "pptx")
+ assert(dfResults(0).get(2) == "aut-test-fixtures.pptx")
+ assert(dfResults(0).get(3) == "pptx")
assert(
dfResults(0).get(
- 3
+ 4
) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
assert(
dfResults(0).get(
- 4
+ 5
) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
- assert(dfResults(0).get(5) == "7a7b1fe4b6d311376eaced9de3b682ee")
- assert(dfResults(0).get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
+ assert(dfResults(0).get(6) == "7a7b1fe4b6d311376eaced9de3b682ee")
+ assert(dfResults(0).get(7) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
index 79b8c781..50df3719 100644
--- a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
@@ -45,21 +45,22 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 4
assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
assert(
dfResults(0).get(
- 0
+ 1
) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods"
)
- assert(dfResults(0).get(1) == "test-aut-fixture.ods")
- assert(dfResults(0).get(2) == "ods")
+ assert(dfResults(0).get(2) == "test-aut-fixture.ods")
+ assert(dfResults(0).get(3) == "ods")
assert(
- dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet"
+ dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet"
)
assert(
- dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet"
+ dfResults(0).get(5) == "application/vnd.oasis.opendocument.spreadsheet"
)
- assert(dfResults(0).get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
- assert(dfResults(0).get(6) == "448c357e78317877a98a399448031a89f1dda6fb")
+ assert(dfResults(0).get(6) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
+ assert(dfResults(0).get(7) == "448c357e78317877a98a399448031a89f1dda6fb")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
index ac525428..6cb8ceb3 100644
--- a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
@@ -45,15 +45,16 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1
assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190817")
assert(
- dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
+ dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
)
- assert(dfResults(0).get(1) == "2018-11-12%2016.14.11.mp4")
- assert(dfResults(0).get(2) == "mp4")
- assert(dfResults(0).get(3) == "video/mp4")
+ assert(dfResults(0).get(2) == "2018-11-12%2016.14.11.mp4")
+ assert(dfResults(0).get(3) == "mp4")
assert(dfResults(0).get(4) == "video/mp4")
- assert(dfResults(0).get(5) == "2cde7de3213a87269957033f6315fce2")
- assert(dfResults(0).get(6) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
+ assert(dfResults(0).get(5) == "video/mp4")
+ assert(dfResults(0).get(6) == "2cde7de3213a87269957033f6315fce2")
+ assert(dfResults(0).get(7) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
index 8ea033d3..dca8350c 100644
--- a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
@@ -47,17 +47,18 @@ class WordProcessorInformationExtractorTest
val RESULTSLENGTH = 3
assert(dfResults.length == RESULTSLENGTH)
+ assert(dfResults(0).get(0) == "20190815")
assert(
dfResults(0).get(
- 0
+ 1
) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf"
)
- assert(dfResults(0).get(1) == "test-aut-fixtures.rtf")
- assert(dfResults(0).get(2) == "rtf")
- assert(dfResults(0).get(3) == "application/rtf")
+ assert(dfResults(0).get(2) == "test-aut-fixtures.rtf")
+ assert(dfResults(0).get(3) == "rtf")
assert(dfResults(0).get(4) == "application/rtf")
- assert(dfResults(0).get(5) == "e483512b65ba44d71e843c57de2adeb7")
- assert(dfResults(0).get(6) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
+ assert(dfResults(0).get(5) == "application/rtf")
+ assert(dfResults(0).get(6) == "e483512b65ba44d71e843c57de2adeb7")
+ assert(dfResults(0).get(7) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
}
after {