@@ -27,15 +27,19 @@ public class Deduplicator
27
27
{
28
28
private List <String > warcFiles = new ArrayList ();
29
29
private List <String > warcRecords = new ArrayList ();
30
- private RandomAccessFile digests ;
31
- // Default length (8k) that will be copied (written) to dedup_warc
32
- private int bufferSize = 8129 ;
30
+ private RandomAccessFile digests ; // NOTE(review): presumably opened on the digestsPath given to deduplicate() — not visible here, confirm
31
+ private int bufferSize ; // length that is copied (written) to dedup_warc; set by the no-arg constructor, the int constructor goes through setBufferSize()
32
+ private String currentWarc ; // absolute path of the WARC currently being processed; assigned in deduplicate() before each writeWarcDedup() call and reopened by writeRevisitRecord()
33
33
34
- public Deduplicator () { }
34
/**
 * Creates a deduplicator that uses the default buffer size.
 */
public Deduplicator()
{
    // Default length (8 KiB) that will be copied (written) to dedup_warc.
    // The literal used to be 8129, a digit transposition of the intended 8192.
    bufferSize = 8192;
}
35
39
36
40
// Creates a deduplicator with a caller-supplied buffer size; the value is
// applied through setBufferSize() rather than assigned to the field directly.
public Deduplicator ( int size )
37
41
{
38
- bufferSize = size ;
42
+ setBufferSize ( size ) ;
39
43
}
40
44
41
45
public void deduplicate ( String digestsPath , String rootDirPath )
@@ -49,7 +53,8 @@ public void deduplicate( String digestsPath, String rootDirPath )
49
53
findAllWarcsRecursively ( rootDir );
50
54
51
55
for ( String warc : warcFiles )
52
- {
56
+ {
57
+ currentWarc = warc ;
53
58
writeWarcDedup ( warc );
54
59
printWarcRecords ( warcRecords );
55
60
}
@@ -87,9 +92,6 @@ private void writeWarcDedup( String warcAbsolutePath ) throws IOException
87
92
FileOutputStream warcOutputStream = new FileOutputStream (
88
93
warcDedupAbsolutePath );
89
94
90
- // Delete the following line after testing
91
- //FileOutputStream warcOutputStream2 = new FileOutputStream( "/home/msm/warcrefs/warc.gz" ); // "/home/msm/warcrefs/warc.warc.gz"
92
-
93
95
int preOffset = 0 ;
94
96
int preLength = 0 ;
95
97
int offset = 0 ;
@@ -164,9 +166,6 @@ private void writeWarcDedup( String warcAbsolutePath ) throws IOException
164
166
warcInputStream .close ();
165
167
//warcRevisitInputStream.close();
166
168
warcOutputStream .close ();
167
-
168
- // Delete the following line after testing
169
- //warcOutputStream2.close();
170
169
}
171
170
catch ( Exception e )
172
171
{
@@ -182,7 +181,7 @@ private void writeRevisitRecord( FileInputStream fis, FileOutputStream fos,
182
181
WarcReader wrc = new WarcReaderCompressed ();
183
182
184
183
// TODO: should not create a new FileInputStream object when invoking writeRevisitRecord method
185
- fis = new FileInputStream ( "/home/msm/warcrefs/JAN25_00336-20110731050545553-00076-14438~ia714237.archive.bibalex.org~8443.warc.gz" );
184
+ fis = new FileInputStream ( currentWarc );
186
185
fis .skip ( offset );
187
186
WarcRecord record = wrc .getNextRecordFrom ( fis , offset );
188
187
0 commit comments