-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add bed12 reader #57
base: main
Are you sure you want to change the base?
Add bed12 reader #57
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 | ||
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 foo | ||
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 2 433,399 0,3601 bar |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 | ||
chr22 2000 6000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA | ||
chr22 2000 6000 cloneB |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 | ||
chr22 2000 6000 cloneB 900 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + | ||
chr22 2000 6000 cloneB 900 - |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + 1000 | ||
chr22 2000 6000 cloneB 900 - 2000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + 1000 5000 | ||
chr22 2000 6000 cloneB 900 - 2000 6000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
chr22 1000 5000 cloneA 960 + 1000 5000 0 | ||
chr22 2000 6000 cloneB 900 - 2000 6000 255,255,255 |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,291 @@ | ||
use arrow::{ | ||
array::{ | ||
ArrayRef, GenericStringBuilder, ListBuilder, StructBuilder, UInt16Builder, UInt64Builder, | ||
UInt8Builder, | ||
}, | ||
datatypes::{DataType, Field}, | ||
error::ArrowError, | ||
record_batch::RecordBatch, | ||
}; | ||
use noodles::bed; | ||
use std::{ | ||
fs::File, | ||
io::{self, BufRead, BufReader}, | ||
str, | ||
sync::Arc, | ||
}; | ||
|
||
use crate::batch_builder::{write_ipc, BatchBuilder}; | ||
|
||
pub fn new_from_path(path: &str) -> io::Result<bed::Reader<BufReader<File>>> { | ||
File::open(path).map(BufReader::new).map(bed::Reader::new) | ||
} | ||
|
||
pub fn new_from_reader<R>(bed: R) -> io::Result<bed::Reader<BufReader<R>>> | ||
where | ||
R: io::Read, | ||
{ | ||
Ok(bed::Reader::new(BufReader::new(bed))) | ||
} | ||
|
||
/// Returns the records in the given region as Apache Arrow IPC. | ||
/// | ||
/// If the region is `None`, all records are returned. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ```no_run | ||
/// use oxbow::bed; | ||
/// | ||
/// let mut reader = bed::new_from_path("sample.bed").unwrap(); | ||
/// let ipc = bed::records_to_ipc(reader).unwrap(); | ||
/// ``` | ||
pub fn records_to_ipc<T>(mut reader: bed::Reader<T>) -> Result<Vec<u8>, ArrowError> | ||
where | ||
T: BufRead, | ||
{ | ||
let batch_builder = BedBatchBuilder::new(1024)?; | ||
let records = reader.records().map(|r| r.unwrap()); | ||
write_ipc(records, batch_builder) | ||
} | ||
|
||
struct BedBatchBuilder { | ||
reference_sequence_name: GenericStringBuilder<i32>, | ||
start_position: UInt64Builder, | ||
end_position: UInt64Builder, | ||
name: GenericStringBuilder<i32>, | ||
score: UInt16Builder, | ||
strand: GenericStringBuilder<i32>, | ||
thick_start: UInt64Builder, | ||
thick_end: UInt64Builder, | ||
color: StructBuilder, | ||
block_sizes: ListBuilder<UInt64Builder>, | ||
block_starts: ListBuilder<UInt64Builder>, | ||
} | ||
|
||
impl BedBatchBuilder { | ||
pub fn new(_capacity: usize) -> Result<Self, ArrowError> { | ||
Ok(Self { | ||
reference_sequence_name: GenericStringBuilder::<i32>::new(), | ||
start_position: UInt64Builder::new(), | ||
end_position: UInt64Builder::new(), | ||
name: GenericStringBuilder::<i32>::new(), | ||
score: UInt16Builder::new(), | ||
strand: GenericStringBuilder::<i32>::new(), | ||
thick_start: UInt64Builder::new(), | ||
thick_end: UInt64Builder::new(), | ||
color: StructBuilder::new( | ||
vec![ | ||
Field::new("r", DataType::UInt8, true), | ||
Field::new("g", DataType::UInt8, true), | ||
Field::new("b", DataType::UInt8, true), | ||
], | ||
vec![ | ||
Box::new(UInt8Builder::new()), | ||
Box::new(UInt8Builder::new()), | ||
Box::new(UInt8Builder::new()), | ||
], | ||
), | ||
block_sizes: ListBuilder::new(UInt64Builder::new()), | ||
block_starts: ListBuilder::new(UInt64Builder::new()), | ||
}) | ||
} | ||
} | ||
|
||
impl BatchBuilder for BedBatchBuilder { | ||
// Noodles implements bed Record structs for bed3-bed9 and bed12. | ||
// The following only handles bed12. | ||
type Record<'a> = &'a bed::Record<12>; | ||
Comment on lines
+95
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We may need to deviate from oxbow's |
||
|
||
fn push(&mut self, record: Self::Record<'_>) { | ||
// Mandatory fields (spec) | ||
self.reference_sequence_name | ||
.append_value(record.reference_sequence_name()); | ||
self.start_position | ||
.append_value(record.start_position().get().try_into().unwrap()); | ||
self.end_position | ||
.append_value(record.end_position().get().try_into().unwrap()); | ||
|
||
// Optional fields (spec) | ||
if let Some(name) = record.name() { | ||
self.name.append_value(name.to_string()); | ||
} else { | ||
self.name.append_null(); | ||
} | ||
|
||
if let Some(score) = record.score() { | ||
self.score.append_value(score.get().try_into().unwrap()); | ||
} else { | ||
// A score of 0 will be handled as a null as well | ||
self.score.append_null(); | ||
} | ||
|
||
if let Some(strand) = record.strand() { | ||
self.strand.append_value(strand.to_string()); | ||
} else { | ||
self.strand.append_null(); | ||
} | ||
|
||
// thick_start and thick_end are required fields in Noodle's bed7/8 structs. | ||
self.thick_start | ||
.append_value(record.thick_start().get().try_into().unwrap()); | ||
|
||
self.thick_end | ||
.append_value(record.thick_end().get().try_into().unwrap()); | ||
|
||
if let Some(color) = record.color() { | ||
let colors = [color.red(), color.green(), color.blue()]; | ||
for (i, val) in colors.iter().enumerate() { | ||
self.color | ||
.field_builder::<UInt8Builder>(i) | ||
.unwrap() | ||
.append_value(*val); | ||
} | ||
|
||
// Handle 1 or 2 missing colors | ||
if colors.len() == 2 { | ||
self.color | ||
.field_builder::<UInt8Builder>(2) | ||
.unwrap() | ||
.append_null(); | ||
} else if colors.len() == 1 { | ||
self.color | ||
.field_builder::<UInt8Builder>(1) | ||
.unwrap() | ||
.append_null(); | ||
self.color | ||
.field_builder::<UInt8Builder>(2) | ||
.unwrap() | ||
.append_null(); | ||
} | ||
Comment on lines
+145
to
+160
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A common issue when debugging struct arrays is having different numbers of records for the resulting Record Batch |
||
self.color.append(true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another arrow builder gotcha is the difference between |
||
} else { | ||
for i in 0..3 { | ||
self.color | ||
.field_builder::<UInt8Builder>(i) | ||
.unwrap() | ||
.append_null(); | ||
} | ||
self.color.append(true); | ||
} | ||
|
||
let (block_sizes, block_starts): (Vec<_>, Vec<_>) = record.blocks().iter().cloned().unzip(); | ||
let block_sizes_option_u64: Vec<Option<u64>> = block_sizes | ||
.into_iter() | ||
.map(|usize_value| Some(usize_value as u64)) | ||
.collect(); | ||
let block_starts_option_u64: Vec<Option<u64>> = block_starts | ||
.into_iter() | ||
.map(|usize_value| Some(usize_value as u64)) | ||
.collect(); | ||
self.block_starts.append_value(block_sizes_option_u64); | ||
self.block_sizes.append_value(block_starts_option_u64); | ||
} | ||
|
||
fn finish(mut self) -> Result<RecordBatch, ArrowError> { | ||
RecordBatch::try_from_iter(vec![ | ||
( | ||
"reference_sequence_name", | ||
Arc::new(self.reference_sequence_name.finish()) as ArrayRef, | ||
), | ||
( | ||
"start_position", | ||
Arc::new(self.start_position.finish()) as ArrayRef, | ||
), | ||
( | ||
"end_position", | ||
Arc::new(self.end_position.finish()) as ArrayRef, | ||
), | ||
("name", Arc::new(self.name.finish()) as ArrayRef), | ||
("score", Arc::new(self.score.finish()) as ArrayRef), | ||
("strand", Arc::new(self.strand.finish()) as ArrayRef), | ||
( | ||
"thick_start", | ||
Arc::new(self.thick_start.finish()) as ArrayRef, | ||
), | ||
("thick_end", Arc::new(self.thick_end.finish()) as ArrayRef), | ||
("color", Arc::new(self.color.finish()) as ArrayRef), | ||
( | ||
"block_sizes", | ||
Arc::new(self.block_sizes.finish()) as ArrayRef, | ||
), | ||
( | ||
"block_starts", | ||
Arc::new(self.block_starts.finish()) as ArrayRef, | ||
), | ||
]) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
use arrow::ipc::reader::FileReader; | ||
use arrow::record_batch::RecordBatch; | ||
|
||
fn read_record_batch(fixture_path: &str) -> RecordBatch { | ||
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); | ||
dir.push(fixture_path); | ||
let reader = new_from_path(dir.to_str().unwrap()).unwrap(); | ||
let ipc = records_to_ipc(reader).unwrap(); | ||
let cursor = std::io::Cursor::new(ipc); | ||
let mut arrow_reader = FileReader::try_new(cursor, None).unwrap(); | ||
// make sure we have one batch | ||
assert_eq!(arrow_reader.num_batches(), 1); | ||
arrow_reader.next().unwrap().unwrap() | ||
} | ||
|
||
//#[test] | ||
//fn test_read_all_bed3() { | ||
// let record_batch = read_record_batch("../fixtures/bed3.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed4() { | ||
// let record_batch = read_record_batch("../fixtures/bed4.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed5() { | ||
// let record_batch = read_record_batch("../fixtures/bed5.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed6() { | ||
// let record_batch = read_record_batch("../fixtures/bed6.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed7() { | ||
// let record_batch = read_record_batch("../fixtures/bed7.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed8() { | ||
// let record_batch = read_record_batch("../fixtures/bed8.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
//#[test] | ||
//fn test_read_all_bed9() { | ||
// let record_batch = read_record_batch("../fixtures/bed9.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
|
||
#[test] | ||
fn test_read_all_bed12() { | ||
let record_batch = read_record_batch("../fixtures/bed12.bed"); | ||
assert_eq!(record_batch.num_rows(), 2); | ||
} | ||
|
||
//#[test] | ||
//fn test_read_all_bed12plus() { | ||
// let record_batch = read_record_batch("../fixtures/bed12plus.bed"); | ||
// assert_eq!(record_batch.num_rows(), 2); | ||
//} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
|
||
pub mod bam; | ||
mod batch_builder; | ||
pub mod bed; | ||
pub mod fasta; | ||
pub mod fastq; | ||
pub mod vpos; | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
StructBuilder
for the arrow Struct Array takes the schema of the struct (the boolean sets the nullability of the field) and the builders for each field.