diff --git a/fixtures/bed12.bed b/fixtures/bed12.bed new file mode 100644 index 0000000..196f40f --- /dev/null +++ b/fixtures/bed12.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 +chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601 diff --git a/fixtures/bed12plus.bed b/fixtures/bed12plus.bed new file mode 100644 index 0000000..d03f2db --- /dev/null +++ b/fixtures/bed12plus.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 foo +chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601 bar diff --git a/fixtures/bed3.bed b/fixtures/bed3.bed new file mode 100644 index 0000000..9a6a504 --- /dev/null +++ b/fixtures/bed3.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 +chr22 2000 6000 diff --git a/fixtures/bed4.bed b/fixtures/bed4.bed new file mode 100644 index 0000000..8084acf --- /dev/null +++ b/fixtures/bed4.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA +chr22 2000 6000 cloneB diff --git a/fixtures/bed5.bed b/fixtures/bed5.bed new file mode 100644 index 0000000..d9e7b4d --- /dev/null +++ b/fixtures/bed5.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 +chr22 2000 6000 cloneB 900 diff --git a/fixtures/bed6.bed b/fixtures/bed6.bed new file mode 100644 index 0000000..1326f7e --- /dev/null +++ b/fixtures/bed6.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 + +chr22 2000 6000 cloneB 900 - diff --git a/fixtures/bed7.bed b/fixtures/bed7.bed new file mode 100644 index 0000000..31be809 --- /dev/null +++ b/fixtures/bed7.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 + 1000 +chr22 2000 6000 cloneB 900 - 2000 diff --git a/fixtures/bed8.bed b/fixtures/bed8.bed new file mode 100644 index 0000000..9e47486 --- /dev/null +++ b/fixtures/bed8.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 + 1000 5000 +chr22 2000 6000 cloneB 900 - 2000 6000 diff --git a/fixtures/bed9.bed b/fixtures/bed9.bed new file mode 100644 index 0000000..6a6d134 --- /dev/null +++ b/fixtures/bed9.bed @@ -0,0 +1,2 @@ +chr22 1000 5000 cloneA 960 
+ 1000 5000 0 +chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 diff --git a/oxbow/Cargo.lock b/oxbow/Cargo.lock index c85f225..2c21770 100644 --- a/oxbow/Cargo.lock +++ b/oxbow/Cargo.lock @@ -742,6 +742,7 @@ checksum = "de44e418359f87564942f592782671c69de012c2237d247052d982c9e0af2ef6" dependencies = [ "noodles-bam", "noodles-bcf", + "noodles-bed", "noodles-bgzf", "noodles-core", "noodles-cram", @@ -784,6 +785,15 @@ dependencies = [ "noodles-vcf", ] +[[package]] +name = "noodles-bed" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deba180b7b94c524307da91d5bc983e0ccb9a08c4e24384cc8f93e0210af254a" +dependencies = [ + "noodles-core", +] + [[package]] name = "noodles-bgzf" version = "0.25.0" diff --git a/oxbow/Cargo.toml b/oxbow/Cargo.toml index 23e96d3..be26322 100644 --- a/oxbow/Cargo.toml +++ b/oxbow/Cargo.toml @@ -9,5 +9,5 @@ description = "Read specialized bioinformatic file formats as data frames in R, [dependencies] arrow = "48.0.0" byteorder = "1.5.0" -noodles = { version = "0.55.0", features = ["bam", "bcf", "bgzf", "core", "cram", "fasta", "fastq", "gff", "gtf", "sam", "csi", "vcf", "tabix"] } +noodles = { version = "0.55.0", features = ["bam", "bcf", "bed", "bgzf", "core", "cram", "fasta", "fastq", "gff", "gtf", "sam", "csi", "vcf", "tabix"] } bigtools = { version = "0.3.0", default-features = false, features = ["read"] } diff --git a/oxbow/src/bed.rs b/oxbow/src/bed.rs new file mode 100644 index 0000000..de47884 --- /dev/null +++ b/oxbow/src/bed.rs @@ -0,0 +1,291 @@ +use arrow::{ + array::{ + ArrayRef, GenericStringBuilder, ListBuilder, StructBuilder, UInt16Builder, UInt64Builder, + UInt8Builder, + }, + datatypes::{DataType, Field}, + error::ArrowError, + record_batch::RecordBatch, +}; +use noodles::bed; +use std::{ + fs::File, + io::{self, BufRead, BufReader}, + str, + sync::Arc, +}; + +use crate::batch_builder::{write_ipc, BatchBuilder}; + +pub fn new_from_path(path: &str) -> io::Result>> { + 
File::open(path).map(BufReader::new).map(bed::Reader::new) } + +pub fn new_from_reader(bed: R) -> io::Result>> +where + R: io::Read, +{ + Ok(bed::Reader::new(BufReader::new(bed))) +} + +/// Returns all records as Apache Arrow IPC. +/// +/// All records in the input are returned; plain BED has no region queries. +/// +/// # Examples +/// +/// ```no_run +/// use oxbow::bed; +/// +/// let mut reader = bed::new_from_path("sample.bed").unwrap(); +/// let ipc = bed::records_to_ipc(reader).unwrap(); +/// ``` +pub fn records_to_ipc(mut reader: bed::Reader) -> Result, ArrowError> +where + T: BufRead, +{ + let batch_builder = BedBatchBuilder::new(1024)?; + let records = reader.records().map(|r| r.unwrap()); + write_ipc(records, batch_builder) +} + +struct BedBatchBuilder { + reference_sequence_name: GenericStringBuilder, + start_position: UInt64Builder, + end_position: UInt64Builder, + name: GenericStringBuilder, + score: UInt16Builder, + strand: GenericStringBuilder, + thick_start: UInt64Builder, + thick_end: UInt64Builder, + color: StructBuilder, + block_sizes: ListBuilder, + block_starts: ListBuilder, +} + +impl BedBatchBuilder { + pub fn new(_capacity: usize) -> Result { + Ok(Self { + reference_sequence_name: GenericStringBuilder::::new(), + start_position: UInt64Builder::new(), + end_position: UInt64Builder::new(), + name: GenericStringBuilder::::new(), + score: UInt16Builder::new(), + strand: GenericStringBuilder::::new(), + thick_start: UInt64Builder::new(), + thick_end: UInt64Builder::new(), + color: StructBuilder::new( + vec![ + Field::new("r", DataType::UInt8, true), + Field::new("g", DataType::UInt8, true), + Field::new("b", DataType::UInt8, true), + ], + vec![ + Box::new(UInt8Builder::new()), + Box::new(UInt8Builder::new()), + Box::new(UInt8Builder::new()), + ], + ), + block_sizes: ListBuilder::new(UInt64Builder::new()), + block_starts: ListBuilder::new(UInt64Builder::new()), + }) + } +} + +impl BatchBuilder for BedBatchBuilder { + // Noodles implements bed 
Record structs for bed3-bed9 and bed12. + // The following only handles bed12. + type Record<'a> = &'a bed::Record<12>; + + fn push(&mut self, record: Self::Record<'_>) { + // Mandatory fields (spec) + self.reference_sequence_name + .append_value(record.reference_sequence_name()); + self.start_position + .append_value(record.start_position().get().try_into().unwrap()); + self.end_position + .append_value(record.end_position().get().try_into().unwrap()); + + // Optional fields (spec) + if let Some(name) = record.name() { + self.name.append_value(name.to_string()); + } else { + self.name.append_null(); + } + + if let Some(score) = record.score() { + self.score.append_value(score.get().try_into().unwrap()); + } else { + // A score of 0 will be handled as a null as well + self.score.append_null(); + } + + if let Some(strand) = record.strand() { + self.strand.append_value(strand.to_string()); + } else { + self.strand.append_null(); + } + + // thick_start and thick_end are required fields in Noodles' bed7/8 structs. 
+ self.thick_start + .append_value(record.thick_start().get().try_into().unwrap()); + + self.thick_end + .append_value(record.thick_end().get().try_into().unwrap()); + + if let Some(color) = record.color() { + let colors = [color.red(), color.green(), color.blue()]; + for (i, val) in colors.iter().enumerate() { + self.color + .field_builder::(i) + .unwrap() + .append_value(*val); + } + + // Handle 1 or 2 missing colors + if colors.len() == 2 { + self.color + .field_builder::(2) + .unwrap() + .append_null(); + } else if colors.len() == 1 { + self.color + .field_builder::(1) + .unwrap() + .append_null(); + self.color + .field_builder::(2) + .unwrap() + .append_null(); + } + self.color.append(true); + } else { + for i in 0..3 { + self.color + .field_builder::(i) + .unwrap() + .append_null(); + } + self.color.append(true); + } + + let (block_sizes, block_starts): (Vec<_>, Vec<_>) = record.blocks().iter().cloned().unzip(); + let block_sizes_option_u64: Vec> = block_sizes + .into_iter() + .map(|usize_value| Some(usize_value as u64)) + .collect(); + let block_starts_option_u64: Vec> = block_starts + .into_iter() + .map(|usize_value| Some(usize_value as u64)) + .collect(); + self.block_starts.append_value(block_sizes_option_u64); + self.block_sizes.append_value(block_starts_option_u64); + } + + fn finish(mut self) -> Result { + RecordBatch::try_from_iter(vec![ + ( + "reference_sequence_name", + Arc::new(self.reference_sequence_name.finish()) as ArrayRef, + ), + ( + "start_position", + Arc::new(self.start_position.finish()) as ArrayRef, + ), + ( + "end_position", + Arc::new(self.end_position.finish()) as ArrayRef, + ), + ("name", Arc::new(self.name.finish()) as ArrayRef), + ("score", Arc::new(self.score.finish()) as ArrayRef), + ("strand", Arc::new(self.strand.finish()) as ArrayRef), + ( + "thick_start", + Arc::new(self.thick_start.finish()) as ArrayRef, + ), + ("thick_end", Arc::new(self.thick_end.finish()) as ArrayRef), + ("color", Arc::new(self.color.finish()) as 
ArrayRef), + ( + "block_sizes", + Arc::new(self.block_sizes.finish()) as ArrayRef, + ), + ( + "block_starts", + Arc::new(self.block_starts.finish()) as ArrayRef, + ), + ]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::ipc::reader::FileReader; + use arrow::record_batch::RecordBatch; + + fn read_record_batch(fixture_path: &str) -> RecordBatch { + let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); + dir.push(fixture_path); + let reader = new_from_path(dir.to_str().unwrap()).unwrap(); + let ipc = records_to_ipc(reader).unwrap(); + let cursor = std::io::Cursor::new(ipc); + let mut arrow_reader = FileReader::try_new(cursor, None).unwrap(); + // make sure we have one batch + assert_eq!(arrow_reader.num_batches(), 1); + arrow_reader.next().unwrap().unwrap() + } + + //#[test] + //fn test_read_all_bed3() { + // let record_batch = read_record_batch("../fixtures/bed3.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed4() { + // let record_batch = read_record_batch("../fixtures/bed4.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed5() { + // let record_batch = read_record_batch("../fixtures/bed5.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed6() { + // let record_batch = read_record_batch("../fixtures/bed6.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed7() { + // let record_batch = read_record_batch("../fixtures/bed7.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed8() { + // let record_batch = read_record_batch("../fixtures/bed8.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + //#[test] + //fn test_read_all_bed9() { + // let record_batch = read_record_batch("../fixtures/bed9.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} + + #[test] + fn test_read_all_bed12() { + let record_batch = 
read_record_batch("../fixtures/bed12.bed"); + assert_eq!(record_batch.num_rows(), 2); + } + + //#[test] + //fn test_read_all_bed12plus() { + // let record_batch = read_record_batch("../fixtures/bed12plus.bed"); + // assert_eq!(record_batch.num_rows(), 2); + //} +} diff --git a/oxbow/src/lib.rs b/oxbow/src/lib.rs index 8a572c7..df16fa6 100644 --- a/oxbow/src/lib.rs +++ b/oxbow/src/lib.rs @@ -25,6 +25,7 @@ pub mod bam; mod batch_builder; +pub mod bed; pub mod fasta; pub mod fastq; pub mod vpos; diff --git a/py-oxbow/Cargo.lock b/py-oxbow/Cargo.lock index 19ca9ce..72a97fb 100644 --- a/py-oxbow/Cargo.lock +++ b/py-oxbow/Cargo.lock @@ -773,6 +773,7 @@ checksum = "de44e418359f87564942f592782671c69de012c2237d247052d982c9e0af2ef6" dependencies = [ "noodles-bam", "noodles-bcf", + "noodles-bed", "noodles-bgzf", "noodles-core", "noodles-cram", @@ -815,6 +816,15 @@ dependencies = [ "noodles-vcf", ] +[[package]] +name = "noodles-bed" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deba180b7b94c524307da91d5bc983e0ccb9a08c4e24384cc8f93e0210af254a" +dependencies = [ + "noodles-core", +] + [[package]] name = "noodles-bgzf" version = "0.25.0" diff --git a/py-oxbow/src/lib.rs b/py-oxbow/src/lib.rs index f000b18..d1f9ae3 100644 --- a/py-oxbow/src/lib.rs +++ b/py-oxbow/src/lib.rs @@ -8,6 +8,7 @@ use pyo3::types::PyString; use oxbow::bam; use oxbow::bam::BamReader; use oxbow::bcf; +use oxbow::bed; use oxbow::bigbed::BigBedReader; use oxbow::bigwig::BigWigReader; use oxbow::fasta::FastaReader; @@ -104,6 +105,25 @@ fn read_bam_vpos( } } +#[pyfunction] +fn read_bed(py: Python, path_or_file_like: PyObject) -> PyObject { + if let Ok(string_ref) = path_or_file_like.downcast::(py) { + // If it's a string, treat it as a path + let reader = bed::new_from_path(string_ref.to_str().unwrap()).unwrap(); + let ipc = bed::records_to_ipc(reader).unwrap(); + Python::with_gil(|py| PyBytes::new(py, &ipc).into()) + } else { + // Otherwise, treat it as 
file-like + let file_like = match PyFileLikeObject::new(path_or_file_like, true, false, true) { + Ok(file_like) => file_like, + Err(_) => panic!("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object."), + }; + let reader = bed::new_from_reader(file_like).unwrap(); + let ipc = bed::records_to_ipc(reader).unwrap(); + Python::with_gil(|py| PyBytes::new(py, &ipc).into()) + } +} + #[pyfunction] fn read_vcf( py: Python, @@ -337,6 +357,7 @@ fn py_oxbow(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(partition_from_index_file, m)?)?; m.add_function(wrap_pyfunction!(read_bam, m)?)?; m.add_function(wrap_pyfunction!(read_bam_vpos, m)?)?; + m.add_function(wrap_pyfunction!(read_bed, m)?)?; // m.add_function(wrap_pyfunction!(read_cram, m)?)?; // m.add_function(wrap_pyfunction!(read_cram_vpos, m)?)?; m.add_function(wrap_pyfunction!(read_vcf, m)?)?;