From 9a3347f750e64e9605c0aa33b5a3f6b83ef3c634 Mon Sep 17 00:00:00 2001
From: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>
Date: Sat, 18 Jan 2025 11:38:34 +0200
Subject: [PATCH] Create upheno-paper.ipynb

---
 src/scripts/upheno-paper.ipynb | 860 +++++++++++++++++++++++++++++++++
 1 file changed, 860 insertions(+)
 create mode 100644 src/scripts/upheno-paper.ipynb
diff --git a/src/scripts/upheno-paper.ipynb b/src/scripts/upheno-paper.ipynb
new file mode 100644
index 00000000..fae70476
--- /dev/null
+++ b/src/scripts/upheno-paper.ipynb
@@ -0,0 +1,860 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import sys\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "profile = sys.argv[1]\n",
+    "profile = \"all\"\n",
+    "stopwords = ['abnormally','abnormal','aberrant','variant']\n",
+    "outdir = \"../curation/data\"\n",
+    "uphenorelease_dir = \"../curation/upheno-release/{}/\".format(profile)\n",
+    "\n",
+    "## IN\n",
+    "upheno_mapping_logical = os.path.join(uphenorelease_dir,\"upheno_mapping_logical.csv\")\n",
+    "upheno_species_lexical_file = os.path.join(uphenorelease_dir,\"upheno_species_lexical.csv\")\n",
+    "\n",
+    "## OUT\n",
+    "upheno_mapping_all = os.path.join(uphenorelease_dir,\"upheno_mapping_all.csv\")\n",
+    "upheno_mapping_lexical = os.path.join(uphenorelease_dir,\"upheno_mapping_lexical.csv\")\n",
+    "upheno_mapping_lexical_template = os.path.join(uphenorelease_dir,\"upheno_mapping_lexical_template.csv\")\n",
+    "upheno_mapping_problematic = os.path.join(uphenorelease_dir,\"upheno_mapping_problematic.csv\")\n",
+    "\n",
+    "## Load lexical data\n",
+    "df = pd.read_csv(upheno_species_lexical_file)\n",
+    "df.columns = ['iri','p','label']\n",
+    "\n",
+    "## Load logical mappings\n",
+    "dfl1 = pd.read_csv(upheno_mapping_logical)[['p1','p2']]\n",
+    "dfl2 = dfl1.copy()\n",
+    "dfl2.columns = ['p2','p1']\n",
+    "dfl = pd.concat([dfl1, dfl2], ignore_index=True, sort =False)\n",
+    "dfl = dfl.drop_duplicates()\n",
+    "dfl['cat']=\"logical\"\n",
+    "\n",
+    "## Prepare dataframe for labels\n",
+    "df_label = df[df['p']==\"http://www.w3.org/2000/01/rdf-schema#label\"][['iri','label']]\n",
+    "df_label.columns = ['iri','label']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>p1</th>\n",
+       "      <th>p2</th>\n",
+       "      <th>cat</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0001634</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0011029</td>\n",
+       "      <td>logical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0011373</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0003148</td>\n",
+       "      <td>logical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0003769</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000159</td>\n",
+       "      <td>logical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/FBcv_0000708</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/ZP_0009477</td>\n",
+       "      <td>logical</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/WBPhenotype_000...</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/ZP_0009477</td>\n",
+       "      <td>logical</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                  p1  \\\n",
+       "0          http://purl.obolibrary.org/obo/MP_0001634   \n",
+       "1          http://purl.obolibrary.org/obo/HP_0011373   \n",
+       "2          http://purl.obolibrary.org/obo/MP_0003769   \n",
+       "3        http://purl.obolibrary.org/obo/FBcv_0000708   \n",
+       "4  http://purl.obolibrary.org/obo/WBPhenotype_000...   \n",
+       "\n",
+       "                                          p2      cat  \n",
+       "0  http://purl.obolibrary.org/obo/HP_0011029  logical  \n",
+       "1  http://purl.obolibrary.org/obo/MP_0003148  logical  \n",
+       "2  http://purl.obolibrary.org/obo/HP_0000159  logical  \n",
+       "3  http://purl.obolibrary.org/obo/ZP_0009477  logical  \n",
+       "4  http://purl.obolibrary.org/obo/ZP_0009477  logical  "
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dfl.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>iri</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/UPHENO_0001001</td>\n",
+       "      <td>Phenotype</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/UPHENO_0012047</td>\n",
+       "      <td>Enhanced behavior process</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/UPHENO_0055087</td>\n",
+       "      <td>increased thigmotaxis</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0002797</td>\n",
+       "      <td>increased thigmotaxis (MPO)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/UPHENO_0054934</td>\n",
+       "      <td>increased vertical activity</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             iri                        label\n",
+       "0  http://purl.obolibrary.org/obo/UPHENO_0001001                    Phenotype\n",
+       "1  http://purl.obolibrary.org/obo/UPHENO_0012047    Enhanced behavior process\n",
+       "2  http://purl.obolibrary.org/obo/UPHENO_0055087        increased thigmotaxis\n",
+       "3      http://purl.obolibrary.org/obo/MP_0002797  increased thigmotaxis (MPO)\n",
+       "4  http://purl.obolibrary.org/obo/UPHENO_0054934  increased vertical activity"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_label.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "158626\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Preprocess labels. The most important aspect of this is the stopword removal. This is done by matching a stopword\n",
+    "# that means 'abnormal', removing it and then adding the actual prefix 'abnormal'. For example, \"cell morphology, aberrant\"\n",
+    "# will become 'abnormal cell morphology'. Other than that, most special characters other than space and the ' tick-mark\n",
+    "# are removed.\n",
+    "\n",
+    "def apply_stopword(x, stopword):\n",
+    "    if x:\n",
+    "        if stopword in x:\n",
+    "            x = \"abnormal \"+x.replace(stopword, '')\n",
+    "    return x\n",
+    "\n",
+    "def preprocess_labels(df, stopwords):\n",
+    "    df['label'] = df['label'].astype(str)\n",
+    "    df['label_pp'] = df['label'].str.replace(r\"[(][A-Z]+[)]\", \"\")\n",
+    "    df['label_pp'] = df['label_pp'].str.lower()\n",
+    "    df['label_pp'] = df['label_pp'].str.replace(r\"[^0-9a-z' ]\", \"\")\n",
+    "\n",
+    "    for stopword in stopwords:\n",
+    "        df['label_pp'] = df['label_pp'].apply(lambda x: apply_stopword(x,stopword))\n",
+    "\n",
+    "    df['label_pp'] = df['label_pp'].str.strip()\n",
+    "    df['label_pp'] = df['label_pp'].str.replace(r\"[ ]+\", \" \")\n",
+    "    df=df[~df['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')]\n",
+    "    df=df[df['label_pp']!=\"\"]\n",
+    "    d=df[['iri','label_pp']]\n",
+    "    d.columns=['iri','label']\n",
+    "    d=d.drop_duplicates()\n",
+    "    return d\n",
+    "\n",
+    "d = preprocess_labels(df,stopwords)\n",
+    "l = df_label[~df_label['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')]\n",
+    "print(len(d))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dd=d.groupby('label')['iri'].apply(list).to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-----------------------\n",
+      "abnormal protein transport\n",
+      "['http://purl.obolibrary.org/obo/WBPhenotype_0001864', 'http://purl.obolibrary.org/obo/FYPO_0005744', 'http://purl.obolibrary.org/obo/PHIPO_0000265', 'http://purl.obolibrary.org/obo/FYPO_0000540']\n",
+      "-----------------------\n",
+      "abnormal rna localization\n",
+      "['http://purl.obolibrary.org/obo/FYPO_0003057', 'http://purl.obolibrary.org/obo/PHIPO_0000251', 'http://purl.obolibrary.org/obo/FYPO_0000360']\n",
+      "-----------------------\n",
+      "altered cellular cyclic amp level\n",
+      "['http://purl.obolibrary.org/obo/FYPO_0001659', 'http://purl.obolibrary.org/obo/PHIPO_0001022', 'http://purl.obolibrary.org/obo/PHIPO_0001023']\n",
+      "-----------------------\n",
+      "asd\n",
+      "['http://purl.obolibrary.org/obo/HP_0000729', 'http://purl.obolibrary.org/obo/HP_0001631', 'http://purl.obolibrary.org/obo/MP_0010403']\n",
+      "-----------------------\n",
+      "lethal\n",
+      "['http://purl.obolibrary.org/obo/WBPhenotype_0000062', 'http://purl.obolibrary.org/obo/FBcv_0000351', 'http://purl.obolibrary.org/obo/PHIPO_0000019', 'http://purl.obolibrary.org/obo/PHIPO_0000513']\n",
+      "70\n",
+      "5\n",
+      "154502\n"
+     ]
+    }
+   ],
+   "source": [
+    "# This step is a complicated hack that tries to get rid of some of the false exact synonyms.\n",
+    "# The idea is this: if there is an exact synonym between two terms within an ontology, we get rid of the link.\n",
+    "# Sometimes, however, a synonym is shared between more than one term within an ontology and across ontologies:\n",
+    "# in these cases, only the IRIs that contain the prefix of an ontology contributing more than one term are excluded.\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "def get_dupes(a):\n",
+    "    seen = {}\n",
+    "    dupes = []\n",
+    "\n",
+    "    for x in a:\n",
+    "        if x not in seen:\n",
+    "            seen[x] = 1\n",
+    "        else:\n",
+    "            if seen[x] == 1:\n",
+    "                dupes.append(x)\n",
+    "            seen[x] += 1\n",
+    "    return dupes\n",
+    "\n",
+    "cases = dict()\n",
+    "cases_internal = dict()\n",
+    "i = 0\n",
+    "\n",
+    "exclude_synonyms = dict()\n",
+    "\n",
+    "for label in dd:\n",
+    "    iris = dd.get(label)\n",
+    "    onts = [re.sub('[_][0-9]+', '', iri.replace(\"http://purl.obolibrary.org/obo/\",\"\")) for iri in iris]\n",
+    "    if len(onts)>1:\n",
+    "        if len(onts) != len(set(onts)):\n",
+    "            if len(set(onts))>1:\n",
+    "                cases[label] = iris\n",
+    "                print(\"-----------------------\")\n",
+    "                print(label)\n",
+    "                print(iris)\n",
+    "                dupes = get_dupes(onts)\n",
+    "                for dupe in dupes:\n",
+    "                    for iri in iris:\n",
+    "                        if dupe in iri:\n",
+    "                            if label not in exclude_synonyms:\n",
+    "                                exclude_synonyms[label]=[]\n",
+    "                            exclude_synonyms[label].append(iri)\n",
+    "            else:\n",
+    "                cases_internal[label] = iris\n",
+    "                for iri in iris:\n",
+    "                    if label not in exclude_synonyms:\n",
+    "                        exclude_synonyms[label]=[]\n",
+    "                    exclude_synonyms[label].append(iri)\n",
+    "\n",
+    "\n",
+    "print(len(cases_internal))\n",
+    "print(len(cases))\n",
+    "print(len(dd))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = d"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "158626\n",
+      "158476\n",
+      "250437\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Remove all (IRI, label) pairs that were flagged for exclusion in the previous step\n",
+    "d=x\n",
+    "print(len(d))\n",
+    "for label in exclude_synonyms:\n",
+    "    for iri in exclude_synonyms[label]:\n",
+    "        d = d[~((d['iri']==iri) & (d['label']==label))]\n",
+    "print(len(d))\n",
+    "d = pd.merge(d,l,on=['iri','label'],how='outer')\n",
+    "print(len(d))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dd=d.groupby('label')['iri'].apply(list).to_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# \n",
+    "def pairwise(t):\n",
+    "    it = iter(t)\n",
+    "    return zip(it,it)\n",
+    "\n",
+    "def invert_dol_nonunique(d):\n",
+    "    newdict = {}\n",
+    "    for k in d:\n",
+    "        for v in d[k]:\n",
+    "            newdict.setdefault(v, []).append(k)\n",
+    "    return newdict\n",
+    "\n",
+    "def merge_label_equivalent_cliques(dd_rv):\n",
+    "    merge_labels = dict()\n",
+    "    for iri in dd_rv:\n",
+    "        labels_to_merge = dd_rv.get(iri)\n",
+    "        if len(labels_to_merge)>1:\n",
+    "            for lab in labels_to_merge:\n",
+    "                if lab not in merge_labels:\n",
+    "                    merge_labels[lab] = []\n",
+    "                merge_labels[lab] = list(set(merge_labels[lab]+labels_to_merge))\n",
+    "    return merge_labels\n",
+    "\n",
+    "dd_rv = invert_dol_nonunique(dd)\n",
+    "merge_labels = merge_label_equivalent_cliques(dd_rv)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>iri</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>51713</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0011138</td>\n",
+       "      <td>Abnormality of skin adnexa morphology (HPO)</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             iri  \\\n",
+       "51713  http://purl.obolibrary.org/obo/HP_0011138   \n",
+       "\n",
+       "                                             label  \n",
+       "51713  Abnormality of skin adnexa morphology (HPO)  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "l[l['iri']==\"http://purl.obolibrary.org/obo/HP_0011138\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "6612\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>p1</th>\n",
+       "      <th>p2</th>\n",
+       "      <th>cat</th>\n",
+       "      <th>label_x</th>\n",
+       "      <th>label_y</th>\n",
+       "      <th>o1</th>\n",
+       "      <th>o2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0002639</td>\n",
+       "      <td>lexical</td>\n",
+       "      <td>Micrognathia (HPO)</td>\n",
+       "      <td>micrognathia (MPO)</td>\n",
+       "      <td>HP</td>\n",
+       "      <td>MP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0002639</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>lexical</td>\n",
+       "      <td>micrognathia (MPO)</td>\n",
+       "      <td>Micrognathia (HPO)</td>\n",
+       "      <td>MP</td>\n",
+       "      <td>HP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0004592</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>lexical</td>\n",
+       "      <td>small mandible (MPO)</td>\n",
+       "      <td>Micrognathia (HPO)</td>\n",
+       "      <td>MP</td>\n",
+       "      <td>HP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0004592</td>\n",
+       "      <td>lexical</td>\n",
+       "      <td>Micrognathia (HPO)</td>\n",
+       "      <td>small mandible (MPO)</td>\n",
+       "      <td>HP</td>\n",
+       "      <td>MP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000327</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0004540</td>\n",
+       "      <td>lexical</td>\n",
+       "      <td>Hypoplasia of the maxilla (HPO)</td>\n",
+       "      <td>small maxilla (MPO)</td>\n",
+       "      <td>HP</td>\n",
+       "      <td>MP</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          p1  \\\n",
+       "0  http://purl.obolibrary.org/obo/HP_0000347   \n",
+       "1  http://purl.obolibrary.org/obo/MP_0002639   \n",
+       "2  http://purl.obolibrary.org/obo/MP_0004592   \n",
+       "3  http://purl.obolibrary.org/obo/HP_0000347   \n",
+       "4  http://purl.obolibrary.org/obo/HP_0000327   \n",
+       "\n",
+       "                                          p2      cat  \\\n",
+       "0  http://purl.obolibrary.org/obo/MP_0002639  lexical   \n",
+       "1  http://purl.obolibrary.org/obo/HP_0000347  lexical   \n",
+       "2  http://purl.obolibrary.org/obo/HP_0000347  lexical   \n",
+       "3  http://purl.obolibrary.org/obo/MP_0004592  lexical   \n",
+       "4  http://purl.obolibrary.org/obo/MP_0004540  lexical   \n",
+       "\n",
+       "                           label_x               label_y  o1  o2  \n",
+       "0               Micrognathia (HPO)    micrognathia (MPO)  HP  MP  \n",
+       "1               micrognathia (MPO)    Micrognathia (HPO)  MP  HP  \n",
+       "2             small mandible (MPO)    Micrognathia (HPO)  MP  HP  \n",
+       "3               Micrognathia (HPO)  small mandible (MPO)  HP  MP  \n",
+       "4  Hypoplasia of the maxilla (HPO)   small maxilla (MPO)  HP  MP  "
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def compute_mappings(dd,l):\n",
+    "    data = []\n",
+    "    done = set()\n",
+    "    for label in dd:\n",
+    "        if label in done:\n",
+    "            continue\n",
+    "        done.add(label)\n",
+    "        iris = dd.get(label)\n",
+    "        if label in merge_labels:\n",
+    "            for lab in merge_labels[label]:\n",
+    "                iris.extend(dd.get(lab))\n",
+    "                done.add(lab)\n",
+    "        iris = list(set(iris))\n",
+    "        if len(iris)>1:\n",
+    "            #print(iris)\n",
+    "            pairs = pairwise(iris)\n",
+    "            for pair in pairs:\n",
+    "                data.append([pair[0], pair[1]])\n",
+    "                data.append([pair[1], pair[0]])\n",
+    "    df_mappings =  pd.DataFrame.from_records(data)\n",
+    "    df_mappings = df_mappings.drop_duplicates()\n",
+    "    df_mappings['cat'] = 'lexical'\n",
+    "    df_mappings.columns = ['p1','p2','cat']\n",
+    "    df_maps = pd.merge(df_mappings,l,  how='left', left_on=['p1'], right_on=['iri'])\n",
+    "    df_maps=df_maps.drop('iri',1)\n",
+    "    df_maps = pd.merge(df_maps, l,  how='left', left_on=['p2'], right_on=['iri'])\n",
+    "    df_maps=df_maps.drop('iri',1)\n",
+    "    df_maps['o1']=[re.sub('[_][0-9]+', '', iri.replace(\"http://purl.obolibrary.org/obo/\",\"\")) for iri in df_maps['p1'].values]\n",
+    "    df_maps['o2']=[re.sub('[_][0-9]+', '', iri.replace(\"http://purl.obolibrary.org/obo/\",\"\")) for iri in df_maps['p2'].values]\n",
+    "    return df_maps\n",
+    "\n",
+    "df_mapping = compute_mappings(dd,l)\n",
+    "print(len(df_mapping))\n",
+    "df_mapping.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "128\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Ontology ID</th>\n",
+       "      <th>EquivalentClasses</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ID</td>\n",
+       "      <td>EC %</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0002639</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0002639</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0004592</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>http://purl.obolibrary.org/obo/HP_0000347</td>\n",
+       "      <td>http://purl.obolibrary.org/obo/MP_0004592</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                 Ontology ID  \\\n",
+       "0                                         ID   \n",
+       "1  http://purl.obolibrary.org/obo/HP_0000347   \n",
+       "2  http://purl.obolibrary.org/obo/MP_0002639   \n",
+       "3  http://purl.obolibrary.org/obo/MP_0004592   \n",
+       "4  http://purl.obolibrary.org/obo/HP_0000347   \n",
+       "\n",
+       "                           EquivalentClasses  \n",
+       "0                                       EC %  \n",
+       "1  http://purl.obolibrary.org/obo/MP_0002639  \n",
+       "2  http://purl.obolibrary.org/obo/HP_0000347  \n",
+       "3  http://purl.obolibrary.org/obo/HP_0000347  \n",
+       "4  http://purl.obolibrary.org/obo/MP_0004592  "
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "## Step to investigate why there are mappings between terms within the same ontology.\n",
+    "## Since exact synonyms and labels were used, no such mapping should exist.\n",
+    "## We drop them from df_maps and save them to the 'problematic' file for inspection.\n",
+    "\n",
+    "w=df_mapping[df_mapping['o1']==df_mapping['o2']]\n",
+    "df_maps = df_mapping[df_mapping['o1']!=df_mapping['o2']]\n",
+    "print(len(w))\n",
+    "w.to_csv(upheno_mapping_problematic,index=False)\n",
+    "#df_maps\n",
+    "# print(df_mapping[df_mapping['p1']==\"http://purl.obolibrary.org/obo/ZP_0006897\"])\n",
+    "\n",
+    "df_mapping_template = df_mapping[['p1','p2']].copy()\n",
+    "df_mapping_template.columns = ['Ontology ID','EquivalentClasses']\n",
+    "\n",
+    "df_mapping_template.loc[-1] = ['ID', 'AI obo:UPHENO_0000002']  # adding a row\n",
+    "df_mapping_template.index = df_mapping_template.index + 1  # shifting index\n",
+    "df_mapping_template.sort_index(inplace=True) \n",
+    "\n",
+    "df_mapping.to_csv(upheno_mapping_lexical,index=False)\n",
+    "df_mapping_template.to_csv(upheno_mapping_lexical_template,index=False)\n",
+    "\n",
+    "w[['p1','p2']]\n",
+    "df_mapping_template.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merging the logical mappings with the lexical ones for comparison\n",
+    "print(df_maps.head())\n",
+    "df_m = pd.merge(df_maps[['p1','p2','cat']], dfl,  how='outer', on=['p1','p2'])\n",
+    "df_m = pd.merge(df_m,l,  how='left', left_on=['p1'], right_on=['iri'])\n",
+    "df_m=df_m.drop('iri',1)\n",
+    "df_m = pd.merge(df_m, l,  how='left', left_on=['p2'], right_on=['iri'])\n",
+    "df_m=df_m.drop('iri',1)\n",
+    "df_m['cat'] = df_m[\"cat_x\"].astype(str)+\"-\" + df_m[\"cat_y\"].astype(str)\n",
+    "df_m['cat'] = df_m['cat'].str.replace(\"-nan\", \"\")\n",
+    "df_m['cat'] = df_m['cat'].str.replace(\"nan-\", \"\")\n",
+    "df_m=df_m.drop('cat_x',1)\n",
+    "df_m=df_m.drop('cat_y',1)\n",
+    "\n",
+    "print(df_m['cat'].value_counts(normalize=True))\n",
+    "print(df_m['cat'].value_counts())\n",
+    "\n",
+    "df_m.to_csv(upheno_mapping_all,index=False)\n",
+    "\n",
+    "df_m.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.5"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	p1	p2	cat
0	http://purl.obolibrary.org/obo/MP_0001634	http://purl.obolibrary.org/obo/HP_0011029	logical
1	http://purl.obolibrary.org/obo/HP_0011373	http://purl.obolibrary.org/obo/MP_0003148	logical
2	http://purl.obolibrary.org/obo/MP_0003769	http://purl.obolibrary.org/obo/HP_0000159	logical
3	http://purl.obolibrary.org/obo/FBcv_0000708	http://purl.obolibrary.org/obo/ZP_0009477	logical
4	http://purl.obolibrary.org/obo/WBPhenotype_000...	http://purl.obolibrary.org/obo/ZP_0009477	logical
	iri	label
0	http://purl.obolibrary.org/obo/UPHENO_0001001	Phenotype
1	http://purl.obolibrary.org/obo/UPHENO_0012047	Enhanced behavior process
2	http://purl.obolibrary.org/obo/UPHENO_0055087	increased thigmotaxis
3	http://purl.obolibrary.org/obo/MP_0002797	increased thigmotaxis (MPO)
4	http://purl.obolibrary.org/obo/UPHENO_0054934	increased vertical activity
	p1	p2	cat	label_x	label_y	o1	o2
0	http://purl.obolibrary.org/obo/HP_0000347	http://purl.obolibrary.org/obo/MP_0002639	lexical	Micrognathia (HPO)	micrognathia (MPO)	HP	MP
1	http://purl.obolibrary.org/obo/MP_0002639	http://purl.obolibrary.org/obo/HP_0000347	lexical	micrognathia (MPO)	Micrognathia (HPO)	MP	HP
2	http://purl.obolibrary.org/obo/MP_0004592	http://purl.obolibrary.org/obo/HP_0000347	lexical	small mandible (MPO)	Micrognathia (HPO)	MP	HP
3	http://purl.obolibrary.org/obo/HP_0000347	http://purl.obolibrary.org/obo/MP_0004592	lexical	Micrognathia (HPO)	small mandible (MPO)	HP	MP
4	http://purl.obolibrary.org/obo/HP_0000327	http://purl.obolibrary.org/obo/MP_0004540	lexical	Hypoplasia of the maxilla (HPO)	small maxilla (MPO)	HP	MP
	Ontology ID	EquivalentClasses
0	ID	EC %
1	http://purl.obolibrary.org/obo/HP_0000347	http://purl.obolibrary.org/obo/MP_0002639
2	http://purl.obolibrary.org/obo/MP_0002639	http://purl.obolibrary.org/obo/HP_0000347
3	http://purl.obolibrary.org/obo/MP_0004592	http://purl.obolibrary.org/obo/HP_0000347
4	http://purl.obolibrary.org/obo/HP_0000347	http://purl.obolibrary.org/obo/MP_0004592