caporaso-lab · gregcaporaso · Oct 8, 2024 · Oct 8, 2024 · Oct 9, 2024 · Oct 9, 2024
diff --git a/README.md b/README.md
@@ -78,7 +78,7 @@ Your first step will be filtering the distance matrix you'd like to use for the
 qiime diversity filter-distance-matrix \
 --i-distance-matrix unweighted-unifrac-distance-matrix.qza \
 --m-metadata-file final-analysis-metadata.tsv \
---p-where "[SampleType2] IN ('EMP-Soils', 'Food-Compost', 'Self Sample', 'Compost Post-Roll', 'Bulking Material')" \
+--p-where "[SampleType] IN ('Soil', 'Food Compost', 'Landscape Compost', 'Human Excrement', 'Human Excrement Compost', 'Bulking Material')" \
 --o-filtered-distance-matrix filtered-unweighted-unifrac-distance-matrix.qza
 ```
 
@@ -95,7 +95,7 @@ Now we're ready to generate a pcoa plot!
 qiime gut-to-soil-manuscript-figures pcoa-2d \
 --i-ordination filtered-unweighted-unifrac-2d-pcoa.qza \
 --m-metadata-file final-analysis-metadata.tsv \
---p-measure 'Unweighted Unifrac' \
+--p-measure 'Unweighted UniFrac' \
 --p-average \
 --p-export-legend \
 --p-highlighted-buckets '3, 4' \

diff --git a/gut_to_soil_manuscript_figures/_methods.py b/gut_to_soil_manuscript_figures/_methods.py
@@ -44,12 +44,11 @@ def pcoa_2d(output_dir: str, metadata: qiime2.Metadata,
     swap_axes = str(swap_axes)
     himalaya = str(himalaya)
     pit_toilet = str(pit_toilet)
-    export_legend = str(export_legend)
+    export_legend_str = str(export_legend)
 
     plot_fp = os.path.join(output_dir, 'pcoa_plot.png')
 
-    if export_legend:
-        legend_fp = os.path.join(output_dir, 'legend.png')
+    legend_fp = os.path.join(output_dir, 'legend.png')
 
     command = [
         'python', script_path,
@@ -64,7 +63,7 @@ def pcoa_2d(output_dir: str, metadata: qiime2.Metadata,
         swap_axes,
         himalaya,
         pit_toilet,
-        export_legend,
+        export_legend_str,
         highlighted_buckets,
         legend_fp
     ]
@@ -80,6 +79,13 @@ def pcoa_2d(output_dir: str, metadata: qiime2.Metadata,
         <body>
             <h1>2D PCoA Plot</h1>
             <img src="pcoa_plot.png" alt="PCoA Plot">
+        ''')
+        if export_legend:
+            f.write('''
+                    <p>
+                    <img src="legend.png" alt="PCoA Plot legend">
+                    ''')
+        f.write('''
         </body>
         </html>
         ''')
diff --git a/gut_to_soil_manuscript_figures/scripts/plot_pcoa_2d.py b/gut_to_soil_manuscript_figures/scripts/plot_pcoa_2d.py
@@ -43,11 +43,12 @@ def _bucket_util(highlighted_buckets, md, ord_2d):
         # connecting time series data in order
         md_bucket_sorted = \
             md[(md['Bucket'] == bucket) &
-               (md['SampleType2'] == 'Compost Post-Roll')].sort_values('Week')
+               (md['SampleType'] == 'Human Excrement Compost')].\
+            sort_values('Composting Time Point')
 
         # week 1-52 IDs for selected bucket
-        bucket_ids_sorted = \
-            md_bucket_sorted[md_bucket_sorted['Week'] > 0.0].index.values
+        bucket_ids_sorted = md_bucket_sorted[
+            md_bucket_sorted['Composting Time Point'] > 0.0].index.values
 
         # making sure bucket IDs used are only ones that are present in both
         # the md and ordination results
@@ -62,8 +63,7 @@ def _bucket_util(highlighted_buckets, md, ord_2d):
         # HE
         bucket_ids_HE_week0 = \
             md[(md['Bucket'] == bucket) &
-               (md['SampleType2'] == 'Self Sample') &
-               (md['Week'] == 0.0)].index.values
+               (md['SampleType'] == 'Human Excrement')].index.values
 
         ids_HE_week0 = []
         for i in bucket_ids_HE_week0:
@@ -73,17 +73,16 @@ def _bucket_util(highlighted_buckets, md, ord_2d):
         # bulking
         bucket_ids_bulk_week0 = \
             md[(md['Bucket'] == bucket) &
-               (md['SampleType2'] == 'Bulking Material') &
-               (md['Week'] == 0.0)].index.values
+               (md['SampleType'] == 'Bulking Material')].index.values
 
         ids_bulk_week0 = []
         for i in bucket_ids_bulk_week0:
             if i in ord_2d.index.values:
                 ids_bulk_week0.append(i)
 
         # week 1 i.e. end points for dotted line connecting HE & BM -> HEC
-        bucket_ids_HEC_week1 = \
-            md_bucket_sorted[md_bucket_sorted['Week'] == 1.0].index.values
+        bucket_ids_HEC_week1 = md_bucket_sorted[
+            md_bucket_sorted['Composting Time Point'] == 1.0].index.values
 
         ids_HEC_week1 = []
         for i in bucket_ids_HEC_week1:
@@ -141,8 +140,9 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
         ord_2d[1] = ord_2d[1].multiply(-1)
 
     # allowed sample types to be pulled from the md
-    sample_types = ['EMP-Soils', 'Food-Compost', 'Self Sample',
-                    'Compost Post-Roll', 'Bulking Material']
+    sample_types = ['Soil', 'Food Compost', 'Landscape Compost',
+                    'Human Excrement', 'Human Excrement Compost',
+                    'Bulking Material']
 
     # if using himalaya and/or pit toilet data
     if himalaya == 'True':
@@ -151,21 +151,23 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
         sample_types.append('Pit Toilet')
 
     # sorting the filtered md (by allowed sample types) by week
-    md = metadata[metadata['SampleType2']
-                  .isin(sample_types)].sort_values('Week')
+    md = metadata[metadata['SampleType']
+                  .isin(sample_types)].sort_values('Composting Time Point')
     md['Bucket'] = md['Bucket'].astype(float)
-    md['Week'] = md['Week'].astype(float)
+    md['Composting Time Point'] = md['Composting Time Point'].astype(float)
 
     buckets_md = md[md['Bucket'].between(1, 16)]
 
     # ALL SUBJECT FECAL SAMPLES: IDs -> XY ordination points
-    fecal_ids = list(set(buckets_md[buckets_md['Week'] == 0.0].index.values) &
-                     set(ord_2d.index.values))
+    fecal_ids = list(
+        set(buckets_md[buckets_md['SampleType'] == 'Human Excrement']
+            .index.values) &
+        set(ord_2d.index.values))
     x_fecal, y_fecal = _swap_axis(ord_2d, fecal_ids, swap_axes)
 
     # ALL SUBJECT BULKING MATERIAL: IDs -> XY ordination points
     bulking_ids = \
-        list(set(md[md['SampleType2'] == 'Bulking Material'].index.values) &
+        list(set(md[md['SampleType'] == 'Bulking Material'].index.values) &
              set(ord_2d.index.values))
     x_bulking, y_bulking = _swap_axis(ord_2d, bulking_ids, swap_axes)
 
@@ -176,8 +178,8 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
 
     # (OPTIONAL) WEEKLY MEAN FOR ALL BUCKETS: IDs -> XY ordination points
     if average == 'True':
-        weeks_md = md[md['Week'].between(1, 52)]
-        weeks = list(set(weeks_md['Week'].values))
+        weeks_md = md[md['Composting Time Point'].between(1, 52)]
+        weeks = list(set(weeks_md['Composting Time Point'].values))
 
         # dicts for each week's mean x&y values
         bucket_weekly_avgs_x = {}
@@ -188,9 +190,9 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
             y_list = []
 
             # filtering the md to only include post-roll sample types
-            weekly_bucket_ids = \
-                md[(md['Week'] == week) &
-                    (md['SampleType2'] == 'Compost Post-Roll')].index.values
+            weekly_bucket_ids = md[
+                (md['Composting Time Point'] == week) &
+                (md['SampleType'] == 'Human Excrement Compost')].index.values
 
             # only use IDs that are present both in the md and ordination
             included_ids = []
@@ -233,8 +235,7 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
         if not highlighted_buckets:
             # HE wk 0 mean
             HE_week0 = \
-                md[(md['SampleType2'] == 'Self Sample') &
-                   (md['Week'] == 0.0)].index.values
+                md[(md['SampleType'] == 'Human Excrement')].index.values
             ids_HE_week0 = []
             for i in HE_week0:
                 if i in ord_2d.index.values:
@@ -245,8 +246,7 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
 
             # bulk wk 0 mean
             bulk_week0 = \
-                md[(md['SampleType2'] == 'Bulking Material') &
-                   (md['Week'] == 0.0)].index.values
+                md[(md['SampleType'] == 'Bulking Material')].index.values
             ids_bulk_week0 = []
             for i in bulk_week0:
                 if i in ord_2d.index.values:
@@ -268,13 +268,14 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
     x_buckets, y_buckets = _swap_axis(ord_2d, bucket_ids, swap_axes)
 
     # EMP SOILS
-    emp_ids = md.loc[md['Bucket'] == 0.0].index.values
+    emp_ids = md.loc[md['SampleType'] == 'Soil'].index.values
     x_emp, y_emp = _swap_axis(ord_2d, emp_ids, swap_axes)
 
     # FOOD COMPOST
     compost_ids = \
-        list(set(md.loc[md['Bucket'] == 17.0].index.values) &
-             set(ord_2d.index.values))
+        list(set(md.loc[(md['SampleType'] == 'Food Compost') |
+                        (md['SampleType'] == 'Landscape Compost')].index.values)
+             & set(ord_2d.index.values))
     x_compost, y_compost = _swap_axis(ord_2d, compost_ids, swap_axes)
 
     # (OPTIONAL SAMPLE TYPES) HIMALAYA
@@ -424,7 +425,7 @@ def plot_pcoa_2d(metadata_fp, ordination_fp, measure,
 
             # adding week annotations for each highlighted bucket
             if week_annotations == 'True':
-                for week, x, y in zip((md.loc[ids]['Week']),
+                for week, x, y in zip((md.loc[ids]['Composting Time Point']),
                                       x_bucket, y_bucket):
                     week_int = int(week)
                     ax.annotate(str(week_int), weight='bold', color='purple',