Skip to content

Commit

Permalink
Add bed12 reader
Browse files Browse the repository at this point in the history
  • Loading branch information
GarrettNg committed Nov 22, 2023
1 parent fb00333 commit 0e484b6
Show file tree
Hide file tree
Showing 15 changed files with 352 additions and 1 deletion.
2 changes: 2 additions & 0 deletions fixtures/bed12.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601
2 changes: 2 additions & 0 deletions fixtures/bed12plus.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 foo
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601 bar
2 changes: 2 additions & 0 deletions fixtures/bed3.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000
chr22 2000 6000
2 changes: 2 additions & 0 deletions fixtures/bed4.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA
chr22 2000 6000 cloneB
2 changes: 2 additions & 0 deletions fixtures/bed5.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960
chr22 2000 6000 cloneB 900
2 changes: 2 additions & 0 deletions fixtures/bed6.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 +
chr22 2000 6000 cloneB 900 -
2 changes: 2 additions & 0 deletions fixtures/bed7.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 + 1000
chr22 2000 6000 cloneB 900 - 2000
2 changes: 2 additions & 0 deletions fixtures/bed8.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 + 1000 5000
chr22 2000 6000 cloneB 900 - 2000 6000
2 changes: 2 additions & 0 deletions fixtures/bed9.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr22 1000 5000 cloneA 960 + 1000 5000 0
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255
10 changes: 10 additions & 0 deletions oxbow/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion oxbow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ description = "Read specialized bioinformatic file formats as data frames in R,
[dependencies]
arrow = "48.0.0"
byteorder = "1.5.0"
noodles = { version = "0.55.0", features = ["bam", "bcf", "bgzf", "core", "cram", "fasta", "fastq", "gff", "gtf", "sam", "csi", "vcf", "tabix"] }
noodles = { version = "0.55.0", features = ["bam", "bcf", "bed", "bgzf", "core", "cram", "fasta", "fastq", "gff", "gtf", "sam", "csi", "vcf", "tabix"] }
bigtools = { version = "0.3.0", default-features = false, features = ["read"] }
291 changes: 291 additions & 0 deletions oxbow/src/bed.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
use arrow::{
array::{
ArrayRef, GenericStringBuilder, ListBuilder, StructBuilder, UInt16Builder, UInt64Builder,
UInt8Builder,
},
datatypes::{DataType, Field},
error::ArrowError,
record_batch::RecordBatch,
};
use noodles::bed;
use std::{
fs::File,
io::{self, BufRead, BufReader},
str,
sync::Arc,
};

use crate::batch_builder::{write_ipc, BatchBuilder};

pub fn new_from_path(path: &str) -> io::Result<bed::Reader<BufReader<File>>> {
File::open(path).map(BufReader::new).map(bed::Reader::new)
}

pub fn new_from_reader<R>(bed: R) -> io::Result<bed::Reader<BufReader<R>>>
where
R: io::Read,
{
Ok(bed::Reader::new(BufReader::new(bed)))
}

/// Returns the records in the given region as Apache Arrow IPC.
///
/// If the region is `None`, all records are returned.
///
/// # Examples
///
/// ```no_run
/// use oxbow::bed;
///
/// let mut reader = bed::new_from_path("sample.bed").unwrap();
/// let ipc = bed::records_to_ipc(reader).unwrap();
/// ```
pub fn records_to_ipc<T>(mut reader: bed::Reader<T>) -> Result<Vec<u8>, ArrowError>
where
T: BufRead,
{
let batch_builder = BedBatchBuilder::new(1024)?;
let records = reader.records().map(|r| r.unwrap());
write_ipc(records, batch_builder)
}

struct BedBatchBuilder {
reference_sequence_name: GenericStringBuilder<i32>,
start_position: UInt64Builder,
end_position: UInt64Builder,
name: GenericStringBuilder<i32>,
score: UInt16Builder,
strand: GenericStringBuilder<i32>,
thick_start: UInt64Builder,
thick_end: UInt64Builder,
color: StructBuilder,
block_sizes: ListBuilder<UInt64Builder>,
block_starts: ListBuilder<UInt64Builder>,
}

impl BedBatchBuilder {
pub fn new(_capacity: usize) -> Result<Self, ArrowError> {
Ok(Self {
reference_sequence_name: GenericStringBuilder::<i32>::new(),
start_position: UInt64Builder::new(),
end_position: UInt64Builder::new(),
name: GenericStringBuilder::<i32>::new(),
score: UInt16Builder::new(),
strand: GenericStringBuilder::<i32>::new(),
thick_start: UInt64Builder::new(),
thick_end: UInt64Builder::new(),
color: StructBuilder::new(
vec![
Field::new("r", DataType::UInt8, true),
Field::new("g", DataType::UInt8, true),
Field::new("b", DataType::UInt8, true),
],
vec![
Box::new(UInt8Builder::new()),
Box::new(UInt8Builder::new()),
Box::new(UInt8Builder::new()),
],
),
block_sizes: ListBuilder::new(UInt64Builder::new()),
block_starts: ListBuilder::new(UInt64Builder::new()),
})
}
}

impl BatchBuilder for BedBatchBuilder {
// Noodles implements bed Record structs for bed3-bed9 and bed12.
// The following only handles bed12.
type Record<'a> = &'a bed::Record<12>;

fn push(&mut self, record: Self::Record<'_>) {
// Mandatory fields (spec)
self.reference_sequence_name
.append_value(record.reference_sequence_name());
self.start_position
.append_value(record.start_position().get().try_into().unwrap());
self.end_position
.append_value(record.end_position().get().try_into().unwrap());

// Optional fields (spec)
if let Some(name) = record.name() {
self.name.append_value(name.to_string());
} else {
self.name.append_null();
}

if let Some(score) = record.score() {
self.score.append_value(score.get().try_into().unwrap());
} else {
// A score of 0 will be handled as a null as well
self.score.append_null();
}

if let Some(strand) = record.strand() {
self.strand.append_value(strand.to_string());
} else {
self.strand.append_null();
}

// thick_start and thick_end are required fields in Noodle's bed7/8 structs.
self.thick_start
.append_value(record.thick_start().get().try_into().unwrap());

self.thick_end
.append_value(record.thick_end().get().try_into().unwrap());

if let Some(color) = record.color() {
let colors = [color.red(), color.green(), color.blue()];
for (i, val) in colors.iter().enumerate() {
self.color
.field_builder::<UInt8Builder>(i)
.unwrap()
.append_value(*val);
}

// Handle 1 or 2 missing colors
if colors.len() == 2 {
self.color
.field_builder::<UInt8Builder>(2)
.unwrap()
.append_null();
} else if colors.len() == 1 {
self.color
.field_builder::<UInt8Builder>(1)
.unwrap()
.append_null();
self.color
.field_builder::<UInt8Builder>(2)
.unwrap()
.append_null();
}
self.color.append(true);
} else {
for i in 0..3 {
self.color
.field_builder::<UInt8Builder>(i)
.unwrap()
.append_null();
}
self.color.append(true);
}

let (block_sizes, block_starts): (Vec<_>, Vec<_>) = record.blocks().iter().cloned().unzip();
let block_sizes_option_u64: Vec<Option<u64>> = block_sizes
.into_iter()
.map(|usize_value| Some(usize_value as u64))
.collect();
let block_starts_option_u64: Vec<Option<u64>> = block_starts
.into_iter()
.map(|usize_value| Some(usize_value as u64))
.collect();
self.block_starts.append_value(block_sizes_option_u64);
self.block_sizes.append_value(block_starts_option_u64);
}

fn finish(mut self) -> Result<RecordBatch, ArrowError> {
RecordBatch::try_from_iter(vec![
(
"reference_sequence_name",
Arc::new(self.reference_sequence_name.finish()) as ArrayRef,
),
(
"start_position",
Arc::new(self.start_position.finish()) as ArrayRef,
),
(
"end_position",
Arc::new(self.end_position.finish()) as ArrayRef,
),
("name", Arc::new(self.name.finish()) as ArrayRef),
("score", Arc::new(self.score.finish()) as ArrayRef),
("strand", Arc::new(self.strand.finish()) as ArrayRef),
(
"thick_start",
Arc::new(self.thick_start.finish()) as ArrayRef,
),
("thick_end", Arc::new(self.thick_end.finish()) as ArrayRef),
("color", Arc::new(self.color.finish()) as ArrayRef),
(
"block_sizes",
Arc::new(self.block_sizes.finish()) as ArrayRef,
),
(
"block_starts",
Arc::new(self.block_starts.finish()) as ArrayRef,
),
])
}
}

#[cfg(test)]
mod tests {
use super::*;
use arrow::ipc::reader::FileReader;
use arrow::record_batch::RecordBatch;

fn read_record_batch(fixture_path: &str) -> RecordBatch {
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
dir.push(fixture_path);
let reader = new_from_path(dir.to_str().unwrap()).unwrap();
let ipc = records_to_ipc(reader).unwrap();
let cursor = std::io::Cursor::new(ipc);
let mut arrow_reader = FileReader::try_new(cursor, None).unwrap();
// make sure we have one batch
assert_eq!(arrow_reader.num_batches(), 1);
arrow_reader.next().unwrap().unwrap()
}

//#[test]
//fn test_read_all_bed3() {
// let record_batch = read_record_batch("../fixtures/bed3.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed4() {
// let record_batch = read_record_batch("../fixtures/bed4.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed5() {
// let record_batch = read_record_batch("../fixtures/bed5.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed6() {
// let record_batch = read_record_batch("../fixtures/bed6.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed7() {
// let record_batch = read_record_batch("../fixtures/bed7.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed8() {
// let record_batch = read_record_batch("../fixtures/bed8.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

//#[test]
//fn test_read_all_bed9() {
// let record_batch = read_record_batch("../fixtures/bed9.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}

#[test]
fn test_read_all_bed12() {
let record_batch = read_record_batch("../fixtures/bed12.bed");
assert_eq!(record_batch.num_rows(), 2);
}

//#[test]
//fn test_read_all_bed12plus() {
// let record_batch = read_record_batch("../fixtures/bed12plus.bed");
// assert_eq!(record_batch.num_rows(), 2);
//}
}
1 change: 1 addition & 0 deletions oxbow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
pub mod bam;
mod batch_builder;
pub mod bed;
pub mod fasta;
pub mod fastq;
pub mod vpos;
Expand Down
10 changes: 10 additions & 0 deletions py-oxbow/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 0e484b6

Please sign in to comment.