Read mdocfile data from a string (#28)

teamtomo · Aug 15, 2024 · 517bc8d · 517bc8d
1 parent 7084db9
commit 517bc8d
Show file tree

Hide file tree

Showing 7 changed files with 77 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -31,12 +31,23 @@ df = mdocfile.read('my_mdoc_file.mdoc')
 For writing valid mdoc files, please see 
 [writing mdoc files](https://teamtomo.org/mdocfile/writing/).
 
-
-
 # Installation
 
 pip:
 
 ```shell
 pip install mdocfile
 ```
+
+# Parsing from text
+
+`Mdoc.from_string(mdoc_data).as_dataframe()` returns the contents of mdoc data held in a string as a pandas DataFrame.
+This is useful when the mdoc data is not stored in a file (e.g. it comes from a database or a web request).
+
+```python
+from mdocfile.data_models import Mdoc
+
+mdoc_data = ...
+
+df = Mdoc.from_string(mdoc_data).as_dataframe()
+```
diff --git a/docs/index.md b/docs/index.md
@@ -29,14 +29,29 @@ import mdocfile
 df = mdocfile.read('my_mdoc_file.mdoc')
 ```
 
----
-
 For writing valid mdoc files, please see [writing mdoc files](./writing.md).
 
+---
+
 # Installation
 
 pip:
 
 ```shell
 pip install mdocfile
-```
+```
+
+---
+
+# Parsing from text
+
+`Mdoc.from_string(mdoc_data).as_dataframe()` returns the contents of mdoc data held in a string as a pandas DataFrame.
+This is useful when the mdoc data is not stored in a file (e.g. it comes from a database or a web request).
+
+```python
+from mdocfile.data_models import Mdoc
+
+mdoc_data = ...
+
+df = Mdoc.from_string(mdoc_data).as_dataframe()
+```
diff --git a/src/mdocfile/__init__.py b/src/mdocfile/__init__.py
@@ -1 +1 @@
-from .functions import read
+from .functions import read
diff --git a/src/mdocfile/data_models.py b/src/mdocfile/data_models.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from pydantic import field_validator, BaseModel
 from pathlib import Path, PureWindowsPath
 from typing import List, Optional, Tuple, Union, Sequence
@@ -170,7 +171,17 @@ class Mdoc(BaseModel):
     @classmethod
     def from_file(cls, filename: str):
+        """
+        Read the mdoc file at `filename` and build an Mdoc from its lines
+        """
         with open(filename) as file:
-            lines = [line.strip() for line in file.readlines()]
+            return cls.from_lines(file.readlines())
+
+    @classmethod
+    def from_string(cls, string: str):
+        """
+        Construct an Mdoc object from mdoc data held in a string
+
+        The text is split with `str.splitlines()` so that a trailing newline
+        does not produce an extra empty line and every line-ending
+        convention is recognised; the parsed lines then match those obtained
+        when the same text is read with `from_file`.
+        """
+        return cls.from_lines(string.splitlines())
+
+    @classmethod
+    def from_lines(cls, file_lines: List[str]) -> 'Mdoc':
+        lines = [line.strip() for line in file_lines]
         split_idxs = find_section_entries(lines)
         split_idxs.append(len(lines))
 
@@ -185,6 +196,26 @@ def from_file(cls, filename: str):
             in zip(split_idxs, split_idxs[1:])
         ]
         return cls(titles=titles, global_data=global_data, section_data=section_data)
+
+    def as_dataframe(self) -> pd.DataFrame:
+        """
+        Convert an Mdoc object to a pandas DataFrame
+
+        Each section becomes one row. Global data and titles are repeated on
+        every row, and columns containing only missing values are dropped.
+
+        Returns
+        -------
+        df : pd.DataFrame
+            dataframe containing info from the Mdoc object
+        """
+        global_data = self.global_data.model_dump()
+        # dump each section once rather than once per column
+        section_dumps = [section.model_dump() for section in self.section_data]
+        # an Mdoc without sections yields a dataframe with no rows instead of
+        # raising an IndexError
+        section_keys = section_dumps[0].keys() if section_dumps else ()
+        section_data = {
+            k: [dump[k] for dump in section_dumps]
+            for k
+            in section_keys
+        }
+        df = pd.DataFrame(data=section_data)
+
+        # add duplicate copies of global data and mdoc file titles to each row of
+        # the dataframe - tidy data is easier to analyse
+        for k, v in global_data.items():
+            df[k] = [v] * len(df)
+        df['titles'] = [self.titles] * len(df)
+        df = df.dropna(axis='columns', how='all')
+        return df
 
     def to_string(self):
         """

diff --git a/src/mdocfile/functions.py b/src/mdocfile/functions.py
@@ -18,19 +18,4 @@ def read(filename: PathLike) -> pd.DataFrame:
     df : pd.DataFrame
         dataframe containing info from mdoc file
     """
-    mdoc = Mdoc.from_file(filename)
-    global_data = mdoc.global_data.model_dump()
-    section_data = {
-        k: [section.model_dump()[k] for section in mdoc.section_data]
-        for k
-        in mdoc.section_data[0].model_dump().keys()
-    }
-    df = pd.DataFrame(data=section_data)
-
-    # add duplicate copies of global data and mdoc file titles to each row of
-    # the dataframe - tidy data is easier to analyse
-    for k, v in global_data.items():
-        df[k] = [v] * len(df)
-    df['titles'] = [mdoc.titles] * len(df)
-    df = df.dropna(axis='columns', how='all')
-    return df
+    return Mdoc.from_file(filename).as_dataframe()
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,11 +1,16 @@
-import pytest
 from pathlib import Path
 
+import pytest
+
 
 @pytest.fixture
 def tilt_series_mdoc_file():
+    """Path to the tilt series mdoc file in the test data directory"""
     return Path(__file__).parent / 'test_data' / 'tilt_series.mdoc'
 
+@pytest.fixture
+def tilt_series_mdoc_string():
+    """Text contents of the tilt series mdoc file in the test data directory"""
+    mdoc_path = Path(__file__).parent / 'test_data' / 'tilt_series.mdoc'
+    return mdoc_path.read_text()
 
 @pytest.fixture
 def montage_section_mdoc_file():

diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from mdocfile import read
+from mdocfile.data_models import Mdoc
 
 
 def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
@@ -9,6 +10,11 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file):
     assert df.shape == (41, 26)
     assert 'TiltAngle' in df.columns
 
+def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string):
+    """Parsing the tilt series mdoc text gives the expected dataframe"""
+    mdoc = Mdoc.from_string(tilt_series_mdoc_string)
+    frame = mdoc.as_dataframe()
+    assert isinstance(frame, pd.DataFrame)
+    assert frame.shape == (41, 26)
+    assert 'TiltAngle' in frame.columns
 
 def test_read_montage_section_mdoc(montage_section_mdoc_file):
     df = read(montage_section_mdoc_file)