diff --git a/.gitignore b/.gitignore index 54f03288..14859f0a 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ src/main/python/tf/model.zip src/main/python/tf/util/spark.conf src/main/python/tf/model/graph/ src/main/python/tf/model/category/ +.bloop/ +.metals/ diff --git a/README.md b/README.md index ce0f940e..5a1b1965 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Javadoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=javadoc)](http://api.docs.archivesunleashed.io/0.80.0/apidocs/index.html) -[![Scaladoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=scaladoc)](http://api.docs.archivesunleashed.io/0.80.0/scaladocs/index.html) -[![UserDocs](https://img.shields.io/badge/UserDocs-0.80.0-brightgreen?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) +[![Javadoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=javadoc)](http://api.docs.archivesunleashed.io/0.90.0/apidocs/index.html) +[![Scaladoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=scaladoc)](http://api.docs.archivesunleashed.io/0.90.0/scaladocs/index.html) +[![UserDocs](https://img.shields.io/badge/UserDocs-0.90.0-brightgreen?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) [![Contribution Guidelines](http://img.shields.io/badge/CONTRIBUTING-Guidelines-blue.svg)](./CONTRIBUTING.md) diff --git a/pom.xml b/pom.xml index 2247e544..63aa7533 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 0.90.0 + 0.90.1 Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -58,7 +58,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - aut-0.90.0 + aut-0.90.1 @@ -469,7 +469,7 @@ org.jsoup jsoup - 1.7.3 + 1.13.1 org.netpreserve.commons diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala index c162b51a..2d269082 100644 --- a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -34,6 +34,7 @@ object AudioInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala index aa968459..d4ab80fa 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -34,6 +34,7 @@ object ImageInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala index ea51374b..2d105c95 100644 --- a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -34,6 +34,7 @@ object PDFInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala index 98594fd6..0db5868c 100644 --- a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -34,6 +34,7 @@ object PresentationProgramInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala index e303add0..1ca25ac7 100644 --- a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -34,6 +34,7 @@ object SpreadsheetInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala index 1cd3e392..f0839195 100644 --- a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -34,6 +34,7 @@ object VideoInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala index a314a7cb..a8424741 100644 --- a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -34,6 +34,7 @@ object WordProcessorInformationExtractor { import spark.implicits._ // scalastyle:on d.select( + $"crawl_date", $"url", $"filename", $"extension", diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala index b579b905..b7dd22bf 100644 --- a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala @@ -45,13 +45,14 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 1 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://ruebot.net/files/feniz.mp3") - assert(dfResults(0).get(1) == "feniz.mp3") - assert(dfResults(0).get(2) == "mp3") - assert(dfResults(0).get(3) == "audio/mpeg") + assert(dfResults(0).get(0) == "20190817") + assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3") + assert(dfResults(0).get(2) == "feniz.mp3") + assert(dfResults(0).get(3) == "mp3") assert(dfResults(0).get(4) == "audio/mpeg") - assert(dfResults(0).get(5) == "f7e7ec84b12c294e19af1ba41732c733") - assert(dfResults(0).get(6) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") + assert(dfResults(0).get(5) == "audio/mpeg") + assert(dfResults(0).get(6) == "f7e7ec84b12c294e19af1ba41732c733") + assert(dfResults(0).get(7) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") } after { diff --git a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala index 829cbc5d..a414cf20 100644 --- a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala @@ -44,15 +44,16 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 55 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "http://www.archive.org/images/logoc.jpg") - assert(dfResults(0).get(1) == "logoc.jpg") - assert(dfResults(0).get(2) == "jpg") - assert(dfResults(0).get(3) == "image/jpeg") + assert(dfResults(0).get(0) == "20080430") + assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg") + assert(dfResults(0).get(2) == "logoc.jpg") + assert(dfResults(0).get(3) == "jpg") assert(dfResults(0).get(4) == "image/jpeg") - assert(dfResults(0).get(5) == 70) - assert(dfResults(0).get(6) == 56) - assert(dfResults(0).get(7) == "8211d1fbb9b03d8522a1ae378f9d1b24") - assert(dfResults(0).get(8) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") + assert(dfResults(0).get(5) == "image/jpeg") + assert(dfResults(0).get(6) == 70) + assert(dfResults(0).get(7) == 56) + assert(dfResults(0).get(8) == "8211d1fbb9b03d8522a1ae378f9d1b24") + assert(dfResults(0).get(9) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") } after { diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala index ab88280c..3690c8a0 100644 --- a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala @@ -45,17 +45,18 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 2 assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190812") assert( dfResults(0).get( - 0 + 1 ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" ) - assert(dfResults(0).get(1) == "cost-analysis.pdf") - assert(dfResults(0).get(2) == "pdf") - assert(dfResults(0).get(3) == "application/pdf") + assert(dfResults(0).get(2) == "cost-analysis.pdf") + assert(dfResults(0).get(3) == "pdf") assert(dfResults(0).get(4) == "application/pdf") - assert(dfResults(0).get(5) == "aaba59d2287afd40c996488a39bbc0dd") - assert(dfResults(0).get(6) == "569c28e0e8faa6945d6ca88fcd9e195825052c71") + assert(dfResults(0).get(5) == "application/pdf") + assert(dfResults(0).get(6) == "aaba59d2287afd40c996488a39bbc0dd") + assert(dfResults(0).get(7) == "569c28e0e8faa6945d6ca88fcd9e195825052c71") } after { diff --git a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala index 6bdfee35..5daab598 100644 --- a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala @@ -47,25 +47,26 @@ class PresentationProgramInformationExtractorTest val RESULTSLENGTH = 2 assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190815") assert( dfResults(0).get( - 0 + 1 ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" ) - assert(dfResults(0).get(1) == "aut-test-fixtures.pptx") - assert(dfResults(0).get(2) == "pptx") + assert(dfResults(0).get(2) == "aut-test-fixtures.pptx") + assert(dfResults(0).get(3) == "pptx") assert( dfResults(0).get( - 3 + 4 ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" ) assert( dfResults(0).get( - 4 + 5 ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" ) - assert(dfResults(0).get(5) == "7a7b1fe4b6d311376eaced9de3b682ee") - assert(dfResults(0).get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec") + assert(dfResults(0).get(6) == "7a7b1fe4b6d311376eaced9de3b682ee") + assert(dfResults(0).get(7) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec") } after { diff --git a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala index 79b8c781..50df3719 100644 --- a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala @@ -45,21 +45,22 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 4 assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190815") assert( dfResults(0).get( - 0 + 1 ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" ) - assert(dfResults(0).get(1) == "test-aut-fixture.ods") - assert(dfResults(0).get(2) == "ods") + assert(dfResults(0).get(2) == "test-aut-fixture.ods") + assert(dfResults(0).get(3) == "ods") assert( - dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet" + dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet" ) assert( - dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet" + dfResults(0).get(5) == "application/vnd.oasis.opendocument.spreadsheet" ) - assert(dfResults(0).get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") - assert(dfResults(0).get(6) == "448c357e78317877a98a399448031a89f1dda6fb") + assert(dfResults(0).get(6) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") + assert(dfResults(0).get(7) == "448c357e78317877a98a399448031a89f1dda6fb") } after { diff --git a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala index ac525428..6cb8ceb3 100644 --- a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala @@ -45,15 +45,16 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 1 assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190817") assert( - dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" + dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" ) - assert(dfResults(0).get(1) == "2018-11-12%2016.14.11.mp4") - assert(dfResults(0).get(2) == "mp4") - assert(dfResults(0).get(3) == "video/mp4") + assert(dfResults(0).get(2) == "2018-11-12%2016.14.11.mp4") + assert(dfResults(0).get(3) == "mp4") assert(dfResults(0).get(4) == "video/mp4") - assert(dfResults(0).get(5) == "2cde7de3213a87269957033f6315fce2") - assert(dfResults(0).get(6) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2") + assert(dfResults(0).get(5) == "video/mp4") + assert(dfResults(0).get(6) == "2cde7de3213a87269957033f6315fce2") + assert(dfResults(0).get(7) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2") } after { diff --git a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala index 8ea033d3..dca8350c 100644 --- a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala @@ -47,17 +47,18 @@ class WordProcessorInformationExtractorTest val RESULTSLENGTH = 3 assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190815") assert( dfResults(0).get( - 0 + 1 ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" ) - assert(dfResults(0).get(1) == "test-aut-fixtures.rtf") - assert(dfResults(0).get(2) == "rtf") - assert(dfResults(0).get(3) == "application/rtf") + assert(dfResults(0).get(2) == "test-aut-fixtures.rtf") + assert(dfResults(0).get(3) == "rtf") assert(dfResults(0).get(4) == "application/rtf") - assert(dfResults(0).get(5) == "e483512b65ba44d71e843c57de2adeb7") - assert(dfResults(0).get(6) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb") + assert(dfResults(0).get(5) == "application/rtf") + assert(dfResults(0).get(6) == "e483512b65ba44d71e843c57de2adeb7") + assert(dfResults(0).get(7) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb") } after {