Skip to content

Commit

Permalink
Read mdocfile data from a string (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-ji authored Aug 15, 2024
1 parent 7084db9 commit 517bc8d
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 24 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,23 @@ df = mdocfile.read('my_mdoc_file.mdoc')
For writing valid mdoc files, please see
[writing mdoc files](https://teamtomo.org/mdocfile/writing/).



# Installation

pip:

```shell
pip install mdocfile
```

# Parsing from text

`Mdoc.from_string(...).as_dataframe()` returns the contents of an mdoc-formatted string as a pandas DataFrame.
This is useful for mdoc data that is not stored in a file (e.g. data retrieved from a database or a web request).

```python
from mdocfile.data_models import Mdoc

mdoc_data = ...

df = Mdoc.from_string(mdoc_data).as_dataframe()
```
21 changes: 18 additions & 3 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,29 @@ import mdocfile
df = mdocfile.read('my_mdoc_file.mdoc')
```

---

For writing valid mdoc files, please see [writing mdoc files](./writing.md).

---

# Installation

pip:

```shell
pip install mdocfile
```

---

# Parsing from text

`Mdoc.from_string(...).as_dataframe()` returns the contents of an mdoc-formatted string as a pandas DataFrame.
This is useful for mdoc data that is not stored in a file (e.g. data retrieved from a database or a web request).

```python
from mdocfile.data_models import Mdoc

mdoc_data = ...

df = Mdoc.from_string(mdoc_data).as_dataframe()
```
2 changes: 1 addition & 1 deletion src/mdocfile/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .functions import read
from .functions import read
33 changes: 32 additions & 1 deletion src/mdocfile/data_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
from pydantic import field_validator, BaseModel
from pathlib import Path, PureWindowsPath
from typing import List, Optional, Tuple, Union, Sequence
Expand Down Expand Up @@ -170,7 +171,17 @@ class Mdoc(BaseModel):
@classmethod
def from_file(cls, filename: str):
    """Build an instance from the mdoc file at ``filename``.

    The file is opened in text mode and its lines are handed unchanged to
    ``cls.from_lines``, which strips and parses them.
    """
    # Read the handle exactly once: a second ``readlines()`` call on the same
    # open file returns an empty list, so nothing may consume it beforehand.
    with open(filename) as file:
        return cls.from_lines(file.readlines())

@classmethod
def from_string(cls, string: str):
    """Build an instance from mdoc text that is already held in memory.

    ``string`` is split on newline characters and the resulting pieces are
    passed to ``cls.from_lines``.
    """
    return cls.from_lines(string.split('\n'))

@classmethod
def from_lines(cls, file_lines: List[str]) -> 'Mdoc':
lines = [line.strip() for line in file_lines]
split_idxs = find_section_entries(lines)
split_idxs.append(len(lines))

Expand All @@ -185,6 +196,26 @@ def from_file(cls, filename: str):
in zip(split_idxs, split_idxs[1:])
]
return cls(titles=titles, global_data=global_data, section_data=section_data)

def as_dataframe(self) -> pd.DataFrame:
    """
    Convert an Mdoc object to a pandas DataFrame.

    The dataframe has one row per entry in ``self.section_data`` and one
    column per key of the dumped section models. Every global data field and
    the ``titles`` attribute are repeated on each row, and columns whose
    values are all missing are dropped.

    Raises
    ------
    IndexError
        If ``self.section_data`` is empty, because the column names are taken
        from the first section.
    """
    # Dump every section exactly once; dumping inside the per-key loop would
    # repeat the same work for each column.
    section_dumps = [section.model_dump() for section in self.section_data]
    section_columns = {
        key: [dump[key] for dump in section_dumps]
        for key in section_dumps[0]
    }
    df = pd.DataFrame(data=section_columns)

    # add duplicate copies of global data and mdoc file titles to each row of
    # the dataframe - tidy data is easier to analyse
    n_rows = len(df)
    for key, value in self.global_data.model_dump().items():
        # Wrap in a list so a sequence-valued field lands whole in each cell
        # instead of being spread across the rows.
        df[key] = [value] * n_rows
    df['titles'] = [self.titles] * n_rows
    return df.dropna(axis='columns', how='all')

def to_string(self):
"""
Expand Down
17 changes: 1 addition & 16 deletions src/mdocfile/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,4 @@ def read(filename: PathLike) -> pd.DataFrame:
df : pd.DataFrame
dataframe containing info from mdoc file
"""
mdoc = Mdoc.from_file(filename)
global_data = mdoc.global_data.model_dump()
section_data = {
k: [section.model_dump()[k] for section in mdoc.section_data]
for k
in mdoc.section_data[0].model_dump().keys()
}
df = pd.DataFrame(data=section_data)

# add duplicate copies of global data and mdoc file titles to each row of
# the dataframe - tidy data is easier to analyse
for k, v in global_data.items():
df[k] = [v] * len(df)
df['titles'] = [mdoc.titles] * len(df)
df = df.dropna(axis='columns', how='all')
return df
return Mdoc.from_file(filename).as_dataframe()
7 changes: 6 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import pytest
from pathlib import Path

import pytest


@pytest.fixture
def tilt_series_mdoc_file():
    """Return the path of ``tilt_series.mdoc`` in the ``test_data`` directory next to this file."""
    test_data_dir = Path(__file__).parent / 'test_data'
    return test_data_dir / 'tilt_series.mdoc'

@pytest.fixture
def tilt_series_mdoc_string():
    """Return the full text of ``tilt_series.mdoc`` from the ``test_data`` directory next to this file."""
    mdoc_path = Path(__file__).parent / 'test_data' / 'tilt_series.mdoc'
    return mdoc_path.read_text()

@pytest.fixture
def montage_section_mdoc_file():
Expand Down
6 changes: 6 additions & 0 deletions tests/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from mdocfile import read
from mdocfile.data_models import Mdoc


def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
Expand All @@ -9,6 +10,11 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
assert df.shape == (41, 26)
assert 'TiltAngle' in df.columns

def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string):
    """Parsing the tilt-series mdoc text from a string yields the expected dataframe."""
    parsed = Mdoc.from_string(tilt_series_mdoc_string)
    frame = parsed.as_dataframe()
    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == (41, 26)
    assert 'TiltAngle' in frame.columns

def test_read_montage_section_mdoc(montage_section_mdoc_file):
df = read(montage_section_mdoc_file)
Expand Down

0 comments on commit 517bc8d

Please sign in to comment.