From 363135470f2ea68950117cdf2469ac961430baa0 Mon Sep 17 00:00:00 2001 From: Giovanni Del Monte Date: Tue, 18 Feb 2025 15:48:48 +0100 Subject: [PATCH] BUG: numpy.loadtxt reads only 50000 lines when skip_rows >= max_rows (#28319) * fixed bug in function _read in numpy/lib/_npyio_impl.py, misnamed variable skiplines as skiprows; added test in numpy/lib/tests/test_loadtxt.py * fixed sintax in test_loadtxt.py * changed use of mkstemp with use of tmpdir provided by pytest * fixed bug in use of tmpdir in loadtxt test * Update numpy/lib/tests/test_loadtxt.py Co-authored-by: Sebastian Berg * Update file numpy/lib/tests/test_loadtxt.py * Update file numpy/lib/tests/test_loadtxt.py * Update numpy/lib/tests/test_loadtxt.py --------- Co-authored-by: Sebastian Berg --- numpy/lib/_npyio_impl.py | 2 +- numpy/lib/tests/test_loadtxt.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/numpy/lib/_npyio_impl.py b/numpy/lib/_npyio_impl.py index f0d1bb2b0c68..4dc3a4b9b7e2 100644 --- a/numpy/lib/_npyio_impl.py +++ b/numpy/lib/_npyio_impl.py @@ -1084,7 +1084,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', # be adapted (in principle the concatenate could cast). chunks.append(next_arr.astype(read_dtype_via_object_chunks)) - skiprows = 0 # Only have to skip for first chunk + skiplines = 0 # Only have to skip for first chunk if max_rows >= 0: max_rows -= chunk_size if len(next_arr) < chunk_size: diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 116cd1608da3..60717be3bd9a 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -1073,3 +1073,28 @@ def test_maxrows_exceeding_chunksize(nmax): res = np.loadtxt(fname, dtype=str, delimiter=" ", max_rows=nmax) os.remove(fname) assert len(res) == nmax + +@pytest.mark.parametrize("nskip", (0, 10000, 12345, 50000, 67891, 100000)) +def test_skiprow_exceeding_maxrows_exceeding_chunksize(tmpdir, nskip): + # tries to read a file in chunks by skipping a variable amount of lines, + # less, equal, greater than max_rows + file_length = 110000 + data = "\n".join(f"{i} a 0.5 1" for i in range(1, file_length + 1)) + expected_length = min(60000, file_length - nskip) + expected = np.arange(nskip + 1, nskip + 1 + expected_length).astype(str) + + # file-like path + txt = StringIO(data) + res = np.loadtxt(txt, dtype='str', delimiter=" ", skiprows=nskip, max_rows=60000) + assert len(res) == expected_length + # are the right lines read in res? + assert_array_equal(expected, res[:, 0]) + + # file-obj path + tmp_file = tmpdir / "test_data.txt" + tmp_file.write(data) + fname = str(tmp_file) + res = np.loadtxt(fname, dtype='str', delimiter=" ", skiprows=nskip, max_rows=60000) + assert len(res) == expected_length + # are the right lines read in res? + assert_array_equal(expected, res[:, 0])