Skip to content

Commit

Permalink
Working on returning references but there's a serialization error
Browse files Browse the repository at this point in the history
  • Loading branch information
pkerpedjiev committed Sep 16, 2023
1 parent 014fb2c commit 43b2340
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 5 deletions.
57 changes: 52 additions & 5 deletions oxbow/src/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ use arrow::array::{
};
use arrow::{datatypes::Int32Type, error::ArrowError, record_batch::RecordBatch};
use noodles::core::Region;
use noodles::vcf::record::info::field::key::SV_LENGTHS;
use noodles::{bam, bgzf, csi, sam};
use arrow::ipc::writer::FileWriter;

use crate::batch_builder::{write_ipc_err, BatchBuilder};

Expand Down Expand Up @@ -125,6 +127,37 @@ impl<R: Read + Seek> BamReader<R> {
.map(|i| i.map_err(|e| ArrowError::ExternalError(e.into())));
write_ipc_err(records, batch_builder)
}

/// Returns the reference sequences in the BAM in Apache Arrow IPC. ///
/// # Examples
///
/// ```no_run
/// use oxbow::bam::BamReader;
///
/// let mut reader = BamReader::new_from_path("sample.bam").unwrap();
/// let ipc = reader.references_to_ipc().unwrap();
/// ```
pub fn references_to_ipc(
&mut self,
) -> Result<Vec<u8>, ArrowError> {
let mut names = GenericStringBuilder::<i32>::new();
let mut lengths = Int32Array::builder(1024);

for (name, sequence) in self.header.reference_sequences() {
names.append_value(name.as_str());
lengths.append_value(sequence.length().get() as i32);
}

let batch = RecordBatch::try_from_iter(vec![
("name", Arc::new(names.finish()) as ArrayRef),
("length", Arc::new(lengths.finish()) as ArrayRef),
])?;

let mut writer = FileWriter::try_new(Vec::new(), &batch.schema())?;
writer.write(&batch)?;
writer.finish()?;
writer.into_inner()
}
}

struct BamBatchBuilder<'a> {
Expand Down Expand Up @@ -290,18 +323,22 @@ mod tests {
use arrow::ipc::reader::FileReader;
use arrow::record_batch::RecordBatch;

fn read_record_batch(region: Option<&str>) -> RecordBatch {
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
dir.push("../fixtures/sample.bam");
let mut reader = BamReader::new_from_path(dir.to_str().unwrap()).unwrap();
let ipc = reader.records_to_ipc(region).unwrap();
fn record_batch_from_ipc(ipc: Vec<u8>) -> RecordBatch {
let cursor = std::io::Cursor::new(ipc);
let mut arrow_reader = FileReader::try_new(cursor, None).unwrap();
// make sure we have one batch
assert_eq!(arrow_reader.num_batches(), 1);
arrow_reader.next().unwrap().unwrap()
}

fn read_record_batch(region: Option<&str>) -> RecordBatch {
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
dir.push("../fixtures/sample.bam");
let mut reader = BamReader::new_from_path(dir.to_str().unwrap()).unwrap();
let ipc = reader.records_to_ipc(region).unwrap();
record_batch_from_ipc(ipc)
}

#[test]
fn test_read_all() {
let record_batch = read_record_batch(None);
Expand All @@ -319,4 +356,14 @@ mod tests {
let record_batch = read_record_batch(Some("chr1:1-100000"));
assert_eq!(record_batch.num_rows(), 2);
}

#[test]
fn test_references() {
let mut dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
dir.push("../fixtures/sample.bam");
let mut reader = BamReader::new_from_path(dir.to_str().unwrap()).unwrap();

let ipc = reader.references_to_ipc().unwrap();
dbg!(&ipc);
}
}
29 changes: 29 additions & 0 deletions py-oxbow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,34 @@ fn read_bam_vpos(
}
}

#[pyfunction]
fn read_bam_references(
py: Python,
path_or_file_like: PyObject,
index: Option<PyObject>,
) -> PyObject {
if let Ok(string_ref) = path_or_file_like.downcast::<PyString>(py) {
// If it's a string, treat it as a path
let mut reader = BamReader::new_from_path(string_ref.to_str().unwrap()).unwrap();
let ipc = reader.references_to_ipc().unwrap();
Python::with_gil(|py| PyBytes::new(py, &ipc).into())
} else {
// Otherwise, treat it as file-like
let file_like = match PyFileLikeObject::new(path_or_file_like, true, false, true) {
Ok(file_like) => file_like,
Err(_) => panic!("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object."),
};
let index_file_like = match PyFileLikeObject::new(index.unwrap(), true, false, true) {
Ok(file_like) => file_like,
Err(_) => panic!("Unknown argument for `index`. Not a file path string or url, and not a file-like object."),
};
let index = bam::index_from_reader(index_file_like).unwrap();
let mut reader = BamReader::new(file_like, index).unwrap();
let ipc = reader.references_to_ipc().unwrap();
Python::with_gil(|py| PyBytes::new(py, &ipc).into())
}
}

#[pyfunction]
fn read_vcf(
py: Python,
Expand Down Expand Up @@ -292,6 +320,7 @@ fn py_oxbow(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(partition_from_index_file, m)?)?;
m.add_function(wrap_pyfunction!(read_bam, m)?)?;
m.add_function(wrap_pyfunction!(read_bam_vpos, m)?)?;
m.add_function(wrap_pyfunction!(read_bam_references, m)?)?;
// m.add_function(wrap_pyfunction!(read_cram, m)?)?;
// m.add_function(wrap_pyfunction!(read_cram_vpos, m)?)?;
m.add_function(wrap_pyfunction!(read_vcf, m)?)?;
Expand Down

0 comments on commit 43b2340

Please sign in to comment.