From 4964d844313d5e62cf102616d26864dca6fe286e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:18:52 +0100 Subject: [PATCH] Add `ReaderBuilder::with_header` for csv reader (#4949) * Add ReaderBuilder::with_header * Update test --- arrow-csv/examples/csv_calculation.rs | 2 +- arrow-csv/src/reader/mod.rs | 48 ++++++++++++++++----------- arrow/benches/csv_reader.rs | 2 +- parquet/src/bin/parquet-fromcsv.rs | 6 ++-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/arrow-csv/examples/csv_calculation.rs b/arrow-csv/examples/csv_calculation.rs index 12aaadde4415..6ce963e2b012 100644 --- a/arrow-csv/examples/csv_calculation.rs +++ b/arrow-csv/examples/csv_calculation.rs @@ -33,7 +33,7 @@ fn main() { Field::new("c4", DataType::Boolean, true), ]); let mut reader = ReaderBuilder::new(Arc::new(csv_schema)) - .has_header(true) + .with_header(true) .build(file) .unwrap(); diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 1106b16bc46f..a194b35ffa46 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -225,7 +225,7 @@ impl InferredDataType { /// The format specification for the CSV file #[derive(Debug, Clone, Default)] pub struct Format { - has_header: bool, + header: bool, delimiter: Option, escape: Option, quote: Option, @@ -235,7 +235,7 @@ pub struct Format { impl Format { pub fn with_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.header = has_header; self } @@ -280,7 +280,7 @@ impl Format { // get or create header names // when has_header is false, creates default column names with column_ prefix - let headers: Vec = if self.has_header { + let headers: Vec = if self.header { let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); headers.iter().map(|s| s.to_string()).collect() } else { @@ -331,7 +331,7 @@ impl Format { /// Build a [`csv::Reader`] for this [`Format`] fn build_reader(&self, reader: R) -> csv::Reader { let mut builder = csv::ReaderBuilder::new(); - builder.has_headers(self.has_header); + builder.has_headers(self.header); if let Some(c) = self.delimiter { builder.delimiter(c); @@ -403,7 +403,7 @@ pub fn infer_reader_schema( ) -> Result<(Schema, usize), ArrowError> { let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; format.infer_schema(reader, max_read_records) @@ -425,7 +425,7 @@ pub fn infer_schema_from_files( let mut records_to_read = max_read_records.unwrap_or(usize::MAX); let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; @@ -1095,8 +1095,16 @@ impl ReaderBuilder { } /// Set whether the CSV file has headers + #[deprecated(note = "Use with_header")] + #[doc(hidden)] pub fn has_header(mut self, has_header: bool) -> Self { - self.format.has_header = has_header; + self.format.header = has_header; + self + } + + /// Set whether the CSV file has a header + pub fn with_header(mut self, has_header: bool) -> Self { + self.format.header = has_header; self } @@ -1176,7 +1184,7 @@ impl ReaderBuilder { let delimiter = self.format.build_parser(); let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len()); - let header = self.format.has_header as usize; + let header = self.format.header as usize; let (start, end) = match self.bounds { Some((start, end)) => (start + header, end + header), @@ -1317,7 +1325,7 @@ mod tests { .chain(Cursor::new("\n".to_string())) .chain(file_without_headers); let mut csv = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .build(both_files) .unwrap(); let batch = csv.next().unwrap().unwrap(); @@ -1335,7 +1343,7 @@ mod tests { .unwrap(); file.rewind().unwrap(); - let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true); + let builder = ReaderBuilder::new(Arc::new(schema)).with_header(true); let mut csv = builder.build(file).unwrap(); let expected_schema = Schema::new(vec![ @@ -1505,7 +1513,7 @@ mod tests { let file = File::open("test/data/null_test.csv").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .build(file) .unwrap(); @@ -1530,7 +1538,7 @@ mod tests { let file = File::open("test/data/init_null_test.csv").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .build(file) .unwrap(); @@ -1588,7 +1596,7 @@ mod tests { let null_regex = Regex::new("^nil$").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .with_null_regex(null_regex) .build(file) .unwrap(); @@ -1710,7 +1718,7 @@ mod tests { ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .with_delimiter(b'|') .with_batch_size(512) .with_projection(vec![0, 1, 2, 3]); @@ -2037,7 +2045,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_quote(b'~'); // default is ", change to ~ let mut csv_text = Vec::new(); @@ -2069,7 +2077,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_escape(b'\\'); // default is None, change to \ let mut csv_text = Vec::new(); @@ -2101,7 +2109,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_terminator(b'\n'); // default is CRLF, change to LF let mut csv_text = Vec::new(); @@ -2143,7 +2151,7 @@ mod tests { ])); for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() { - let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header); + let mut reader = ReaderBuilder::new(schema.clone()).with_header(has_header); if let Some((start, end)) = bounds { reader = reader.with_bounds(start, end); } @@ -2208,7 +2216,7 @@ mod tests { for capacity in [1, 3, 7, 100] { let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build(File::open(path).unwrap()) .unwrap(); @@ -2226,7 +2234,7 @@ mod tests { let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build_buffered(buffered) .unwrap(); diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 4c3f663bf741..5a91dfe0a6ff 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -45,7 +45,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let cursor = Cursor::new(buf.as_slice()); let reader = csv::ReaderBuilder::new(batch.schema()) .with_batch_size(batch_size) - .has_header(true) + .with_header(true) .build_buffered(cursor) .unwrap(); diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 548bbdbfb8f1..1f5d0a62bbfa 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -321,7 +321,7 @@ fn configure_reader_builder(args: &Args, arrow_schema: Arc) -> ReaderBui let mut builder = ReaderBuilder::new(arrow_schema) .with_batch_size(args.batch_size) - .has_header(args.has_header) + .with_header(args.has_header) .with_delimiter(args.get_delimiter()); builder = configure_reader( @@ -606,7 +606,7 @@ mod tests { let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{reader_builder:?}"); - assert_debug_text(&builder_debug, "has_header", "false"); + assert_debug_text(&builder_debug, "header", "false"); assert_debug_text(&builder_debug, "delimiter", "Some(44)"); assert_debug_text(&builder_debug, "quote", "Some(34)"); assert_debug_text(&builder_debug, "terminator", "None"); @@ -641,7 +641,7 @@ mod tests { ])); let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{reader_builder:?}"); - assert_debug_text(&builder_debug, "has_header", "true"); + assert_debug_text(&builder_debug, "header", "true"); assert_debug_text(&builder_debug, "delimiter", "Some(9)"); assert_debug_text(&builder_debug, "quote", "None"); assert_debug_text(&builder_debug, "terminator", "Some(10)");