Skip to content

Commit 7591d3d

Browse files
Fix issue #6: eliminate all hard-coded values in Deduplicator and Warcrefs.
1 parent ea82dff commit 7591d3d

File tree

2 files changed

+15
-16
lines changed

2 files changed

+15
-16
lines changed

src/main/java/org/bibalex/warcrefs/Deduplicator.java

Lines changed: 12 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -27,15 +27,19 @@ public class Deduplicator
2727
{
2828
private List<String> warcFiles = new ArrayList();
2929
private List<String> warcRecords = new ArrayList();
30-
private RandomAccessFile digests;
31-
// Default length (8k) that will be copied (written) to dedup_warc
32-
private int bufferSize = 8129;
30+
private RandomAccessFile digests;
31+
private int bufferSize;
32+
private String currentWarc; // represents absolute path of the current warc which is under processing
3333

34-
public Deduplicator() { }
34+
public Deduplicator()
35+
{
36+
// Default length (8k) that will be copied (written) to dedup_warc
37+
bufferSize = 8129;
38+
}
3539

3640
public Deduplicator( int size )
3741
{
38-
bufferSize = size;
42+
setBufferSize( size );
3943
}
4044

4145
public void deduplicate( String digestsPath, String rootDirPath )
@@ -49,7 +53,8 @@ public void deduplicate( String digestsPath, String rootDirPath )
4953
findAllWarcsRecursively( rootDir );
5054

5155
for ( String warc : warcFiles )
52-
{
56+
{
57+
currentWarc = warc;
5358
writeWarcDedup( warc );
5459
printWarcRecords( warcRecords );
5560
}
@@ -87,9 +92,6 @@ private void writeWarcDedup( String warcAbsolutePath ) throws IOException
8792
FileOutputStream warcOutputStream = new FileOutputStream(
8893
warcDedupAbsolutePath );
8994

90-
// Delete the following line after testing
91-
//FileOutputStream warcOutputStream2 = new FileOutputStream( "/home/msm/warcrefs/warc.gz" ); // "/home/msm/warcrefs/warc.warc.gz"
92-
9395
int preOffset = 0;
9496
int preLength = 0;
9597
int offset = 0;
@@ -164,9 +166,6 @@ private void writeWarcDedup( String warcAbsolutePath ) throws IOException
164166
warcInputStream.close();
165167
//warcRevisitInputStream.close();
166168
warcOutputStream.close();
167-
168-
// Delete the following line after testing
169-
//warcOutputStream2.close();
170169
}
171170
catch ( Exception e )
172171
{
@@ -182,7 +181,7 @@ private void writeRevisitRecord( FileInputStream fis, FileOutputStream fos,
182181
WarcReader wrc = new WarcReaderCompressed();
183182

184183
// TODO: should not create a new FileInputStream object when invoking writeRevisitRecord method
185-
fis = new FileInputStream( "/home/msm/warcrefs/JAN25_00336-20110731050545553-00076-14438~ia714237.archive.bibalex.org~8443.warc.gz" );
184+
fis = new FileInputStream( currentWarc );
186185
fis.skip( offset );
187186
WarcRecord record = wrc.getNextRecordFrom( fis, offset );
188187

src/main/java/org/bibalex/warcrefs/Warcrefs.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,9 @@ public class Warcrefs
1111
{
1212
public static void main( String[] args ) throws IOException
1313
{
14-
int bufferSize = 8129; // args[0]
15-
String digestsPath = "/home/msm/warcrefs/digests"; // args[1]
16-
String rootDirPath = "/home/msm/warcrefs"; // args[2]
14+
int bufferSize = Integer.parseInt( args[ 0 ] ); // 8129
15+
String digestsPath = args[ 1 ]; // path to digests file
16+
String rootDirPath = args[ 2 ]; // warcs directory
1717

1818
try
1919
{

0 commit comments

Comments
 (0)