import os
import sys

import pandas as pd

# Release profile whose files are processed.  The notebook always works on
# the "all" profile; the former `sys.argv[1]` read was overwritten right away
# and raised IndexError whenever no extra argument was present, so it is gone.
profile = "all"

# Words that are treated as meaning "abnormal" during label normalisation.
stopwords = ['abnormally', 'abnormal', 'aberrant', 'variant']
outdir = "../curation/data"
uphenorelease_dir = "../curation/upheno-release/{}/".format(profile)

## IN
upheno_mapping_logical = os.path.join(uphenorelease_dir, "upheno_mapping_logical.csv")
upheno_species_lexical_file = os.path.join(uphenorelease_dir, "upheno_species_lexical.csv")

## OUT
upheno_mapping_all = os.path.join(uphenorelease_dir, "upheno_mapping_all.csv")
upheno_mapping_lexical = os.path.join(uphenorelease_dir, "upheno_mapping_lexical.csv")
upheno_mapping_lexical_template = os.path.join(uphenorelease_dir, "upheno_mapping_lexical_template.csv")
upheno_mapping_problematic = os.path.join(uphenorelease_dir, "upheno_mapping_problematic.csv")

## Load lexical data: one row per (term IRI, annotation property, text value).
df = pd.read_csv(upheno_species_lexical_file)
df.columns = ['iri', 'p', 'label']

## Load logical mappings and make them symmetric: every (p1, p2) pair is also
## stored as (p2, p1) so that later merges do not depend on direction.
dfl1 = pd.read_csv(upheno_mapping_logical)[['p1', 'p2']]
dfl2 = dfl1.rename(columns={'p1': 'p2', 'p2': 'p1'})
dfl = pd.concat([dfl1, dfl2], ignore_index=True, sort=False)
dfl = dfl.drop_duplicates()
dfl['cat'] = "logical"

## Prepare dataframe for labels: keep only the rdfs:label rows.
df_label = df[df['p'] == "http://www.w3.org/2000/01/rdf-schema#label"][['iri', 'label']]

# (next notebook cell)
df.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p1p2cat
0http://purl.obolibrary.org/obo/MP_0001634http://purl.obolibrary.org/obo/HP_0011029logical
1http://purl.obolibrary.org/obo/HP_0011373http://purl.obolibrary.org/obo/MP_0003148logical
2http://purl.obolibrary.org/obo/MP_0003769http://purl.obolibrary.org/obo/HP_0000159logical
3http://purl.obolibrary.org/obo/FBcv_0000708http://purl.obolibrary.org/obo/ZP_0009477logical
4http://purl.obolibrary.org/obo/WBPhenotype_000...http://purl.obolibrary.org/obo/ZP_0009477logical
\n", + "
" + ], + "text/plain": [ + " p1 \\\n", + "0 http://purl.obolibrary.org/obo/MP_0001634 \n", + "1 http://purl.obolibrary.org/obo/HP_0011373 \n", + "2 http://purl.obolibrary.org/obo/MP_0003769 \n", + "3 http://purl.obolibrary.org/obo/FBcv_0000708 \n", + "4 http://purl.obolibrary.org/obo/WBPhenotype_000... \n", + "\n", + " p2 cat \n", + "0 http://purl.obolibrary.org/obo/HP_0011029 logical \n", + "1 http://purl.obolibrary.org/obo/MP_0003148 logical \n", + "2 http://purl.obolibrary.org/obo/HP_0000159 logical \n", + "3 http://purl.obolibrary.org/obo/ZP_0009477 logical \n", + "4 http://purl.obolibrary.org/obo/ZP_0009477 logical " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfl.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
irilabel
0http://purl.obolibrary.org/obo/UPHENO_0001001Phenotype
1http://purl.obolibrary.org/obo/UPHENO_0012047Enhanced behavior process
2http://purl.obolibrary.org/obo/UPHENO_0055087increased thigmotaxis
3http://purl.obolibrary.org/obo/MP_0002797increased thigmotaxis (MPO)
4http://purl.obolibrary.org/obo/UPHENO_0054934increased vertical activity
\n", + "
" + ], + "text/plain": [ + " iri label\n", + "0 http://purl.obolibrary.org/obo/UPHENO_0001001 Phenotype\n", + "1 http://purl.obolibrary.org/obo/UPHENO_0012047 Enhanced behavior process\n", + "2 http://purl.obolibrary.org/obo/UPHENO_0055087 increased thigmotaxis\n", + "3 http://purl.obolibrary.org/obo/MP_0002797 increased thigmotaxis (MPO)\n", + "4 http://purl.obolibrary.org/obo/UPHENO_0054934 increased vertical activity" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_label.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "158626\n" + ] + } + ], + "source": [ + "# Preprocess labels. The most important aspect to this the stopword removal. this is done by matching a stopword\n", + "# that means 'abnormal', removing it and then adding the actual prefix 'abnormal'. For example, \"cell morphology, aberrant\"\n", + "# will become 'abnormal cell morphology'. 
# Other than that, most special characters other than space and the ' tick-mark
# are removed.

def apply_stopword(x, stopword):
    """Replace an "abnormal"-like stopword by the fixed prefix ``abnormal``.

    The check is a plain substring test: when ``stopword`` occurs anywhere in
    ``x`` every occurrence is deleted and ``"abnormal "`` is prepended.
    Surrounding whitespace is left as is (the caller normalises it later).

    Args:
        x: The label text; falsy values (``""``, ``None``) are returned as is.
        stopword: The substring to look for.

    Returns:
        The rewritten label, or ``x`` unchanged when nothing matched.
    """
    if x and stopword in x:
        return "abnormal " + x.replace(stopword, '')
    return x


def preprocess_labels(df, stopwords):
    """Normalise the labels of ``df`` so that they can be compared lexically.

    Steps, in order: cast to ``str``; strip upper-case source tags such as
    ``(HPO)``; lower-case; drop every character that is not ``0-9``, ``a-z``,
    ``'`` or space; apply each stopword via :func:`apply_stopword`; trim and
    collapse runs of spaces.  Rows whose IRI starts with the UPHENO namespace
    and rows whose normalised label is empty are discarded.

    The patterns are passed with ``regex=True`` explicitly; without it recent
    pandas versions treat them as literal text and no cleaning happens.
    ``df`` itself is not modified.

    Args:
        df: Frame with at least the columns ``iri`` and ``label``.
        stopwords: Iterable of stopwords, applied in the given order.

    Returns:
        A de-duplicated frame with the columns ``iri`` and ``label`` (the
        normalised label), keeping the index of the surviving input rows.
    """
    cleaned = df['label'].astype(str)
    cleaned = cleaned.str.replace(r"[(][A-Z]+[)]", "", regex=True)
    cleaned = cleaned.str.lower()
    cleaned = cleaned.str.replace(r"[^0-9a-z' ]", "", regex=True)

    for stopword in stopwords:
        cleaned = cleaned.apply(apply_stopword, args=(stopword,))

    cleaned = cleaned.str.strip()
    cleaned = cleaned.str.replace(r"[ ]+", " ", regex=True)

    result = pd.DataFrame({'iri': df['iri'], 'label': cleaned})
    is_upheno = result['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')
    result = result[~is_upheno & (result['label'] != "")]
    return result.drop_duplicates()


# Notebook driver: runs inside the notebook kernel (or as a script) and is
# skipped when the definitions above are imported.
if __name__ == "__main__":
    d = preprocess_labels(df, stopwords)
    l = df_label[~df_label['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')]
    print(len(d))

    # (next notebook cell) normalised label -> list of IRIs carrying it
    dd = d.groupby('label')['iri'].apply(list).to_dict()
# This step is a heuristic for discarding false exact synonyms.
# The idea: if the same (normalised) synonym is carried by two terms of one
# ontology, it cannot identify either of them, so those links are removed.
# Sometimes a synonym is shared by several terms within one ontology AND by
# terms of other ontologies; in that case only the terms of the ontologies that
# occur more than once are excluded, and the label is reported for inspection.

import re

_OBO_PREFIX = "http://purl.obolibrary.org/obo/"


def _ontology_of(iri):
    """Return the ontology part of an OBO IRI, e.g. ``HP`` for ``.../obo/HP_0000729``."""
    return re.sub('[_][0-9]+', '', iri.replace(_OBO_PREFIX, ""))


def get_dupes(a):
    """Return the values that occur more than once in ``a``.

    Each duplicated value is listed once, in the order in which its second
    occurrence is met.
    """
    seen = set()
    reported = set()
    dupes = []
    for x in a:
        if x in seen and x not in reported:
            dupes.append(x)
            reported.add(x)
        seen.add(x)
    return dupes


def find_conflicting_synonyms(dd):
    """Find labels that are shared by several terms of the same ontology.

    Args:
        dd: Mapping of label -> list of IRIs carrying that label.

    Returns:
        A tuple ``(cases, cases_internal, exclude_synonyms)`` of dicts keyed by
        label.  ``cases`` holds the IRIs of labels that are duplicated within
        an ontology and also span several ontologies; ``cases_internal`` holds
        those confined to one ontology.  ``exclude_synonyms`` lists, per label,
        the IRIs whose (IRI, label) link should be dropped: all IRIs for an
        internal case, otherwise only those of the repeated ontologies.
    """
    cases = {}
    cases_internal = {}
    exclude_synonyms = {}

    for label, iris in dd.items():
        onts = [_ontology_of(iri) for iri in iris]
        distinct = set(onts)
        if len(onts) < 2 or len(onts) == len(distinct):
            continue  # every ontology contributes at most one term
        if len(distinct) > 1:
            cases[label] = iris
            repeated = set(get_dupes(onts))
            # Compare the extracted ontology itself; a substring test against
            # the whole IRI would also hit unrelated IRIs that merely contain
            # the prefix text.
            exclude_synonyms[label] = [
                iri for iri, ont in zip(iris, onts) if ont in repeated
            ]
        else:
            cases_internal[label] = iris
            exclude_synonyms[label] = list(iris)
    return cases, cases_internal, exclude_synonyms


def remove_excluded_synonyms(d, exclude_synonyms):
    """Drop the rows of ``d`` whose (iri, label) pair is listed for exclusion.

    Args:
        d: Frame with the columns ``iri`` and ``label``.
        exclude_synonyms: Mapping of label -> IRIs, as produced by
            :func:`find_conflicting_synonyms`.

    Returns:
        The remaining rows of ``d`` (index preserved).
    """
    excluded = {
        (iri, label)
        for label, iris in exclude_synonyms.items()
        for iri in iris
    }
    keep = pd.Series(
        [pair not in excluded for pair in zip(d['iri'], d['label'])],
        index=d.index,
        dtype=bool,
    )
    return d[keep]


def pairwise(t):
    """Yield disjoint neighbouring pairs: ``(t0, t1), (t2, t3), ...``.

    A trailing element without a partner is dropped.  This is NOT the set of
    all 2-combinations of ``t``.
    """
    it = iter(t)
    return zip(it, it)


def invert_dol_nonunique(d):
    """Invert a dict of lists: each list item maps to the keys it appeared under."""
    newdict = {}
    for k in d:
        for v in d[k]:
            newdict.setdefault(v, []).append(k)
    return newdict


def merge_label_equivalent_cliques(dd_rv):
    """Group labels that are attached to the same IRI.

    Args:
        dd_rv: Mapping of IRI -> list of labels carried by that IRI.

    Returns:
        Mapping of label -> list of labels (itself included) that share at
        least one IRI with it.  Labels of IRIs with a single label are absent.
        The order inside each list is unspecified.  Only direct sharing is
        considered; chains over several IRIs are not closed transitively.
    """
    grouped = {}
    for labels_to_merge in dd_rv.values():
        if len(labels_to_merge) > 1:
            for lab in labels_to_merge:
                grouped.setdefault(lab, set()).update(labels_to_merge)
    return {lab: list(members) for lab, members in grouped.items()}


# Notebook driver: runs inside the notebook kernel (or as a script) and is
# skipped when the definitions above are imported.
if __name__ == "__main__":
    cases, cases_internal, exclude_synonyms = find_conflicting_synonyms(dd)
    for label, iris in cases.items():
        print("-----------------------")
        print(label)
        print(iris)

    print(len(cases_internal))
    print(len(cases))
    print(len(dd))

    # (next notebook cell) keep a handle on the unfiltered frame so that the
    # filtering cell below can be re-run.
    x = d

    # (next notebook cell)
    # Remove all those IRIs that contained duplicates determined in the previous step
    d = x
    print(len(d))
    d = remove_excluded_synonyms(d, exclude_synonyms)
    print(len(d))
    # Add the original (non-normalised) labels back as additional rows.
    d = pd.merge(d, l, on=['iri', 'label'], how='outer')
    print(len(d))

    # (next notebook cell)
    dd = d.groupby('label')['iri'].apply(list).to_dict()

    # (next notebook cell)
    dd_rv = invert_dol_nonunique(dd)
    merge_labels = merge_label_equivalent_cliques(dd_rv)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
irilabel
51713http://purl.obolibrary.org/obo/HP_0011138Abnormality of skin adnexa morphology (HPO)
\n", + "
" + ], + "text/plain": [ + " iri \\\n", + "51713 http://purl.obolibrary.org/obo/HP_0011138 \n", + "\n", + " label \n", + "51713 Abnormality of skin adnexa morphology (HPO) " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l[l['iri']==\"http://purl.obolibrary.org/obo/HP_0011138\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6612\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p1p2catlabel_xlabel_yo1o2
0http://purl.obolibrary.org/obo/HP_0000347http://purl.obolibrary.org/obo/MP_0002639lexicalMicrognathia (HPO)micrognathia (MPO)HPMP
1http://purl.obolibrary.org/obo/MP_0002639http://purl.obolibrary.org/obo/HP_0000347lexicalmicrognathia (MPO)Micrognathia (HPO)MPHP
2http://purl.obolibrary.org/obo/MP_0004592http://purl.obolibrary.org/obo/HP_0000347lexicalsmall mandible (MPO)Micrognathia (HPO)MPHP
3http://purl.obolibrary.org/obo/HP_0000347http://purl.obolibrary.org/obo/MP_0004592lexicalMicrognathia (HPO)small mandible (MPO)HPMP
4http://purl.obolibrary.org/obo/HP_0000327http://purl.obolibrary.org/obo/MP_0004540lexicalHypoplasia of the maxilla (HPO)small maxilla (MPO)HPMP
\n", + "
def _ontology_prefix(iri):
    """Return the ontology part of an OBO IRI, e.g. ``MP`` for ``.../obo/MP_0002639``."""
    return re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/", ""))


def compute_mappings(dd, l, merge_labels=None):
    """Derive lexical term-to-term mappings from shared labels.

    All IRIs that carry the same label -- or labels declared equivalent in
    ``merge_labels`` -- form one group, and every unordered pair of a group is
    emitted in both directions.  The IRIs of a group are sorted first so the
    result does not depend on set iteration order.  ``dd`` is not modified.

    Args:
        dd: Mapping of label -> list of IRIs carrying that label.
        l: Frame with the columns ``iri`` and ``label``; used to attach a
            display label to both sides of each mapping.
        merge_labels: Optional mapping of label -> labels to be pooled with
            it.  When omitted, the module-level ``merge_labels`` is used.

    Returns:
        A frame with the columns ``p1``, ``p2``, ``cat`` (always
        ``"lexical"``), ``label_x``, ``label_y``, ``o1`` and ``o2``, where
        ``o1``/``o2`` are the ontology prefixes of ``p1``/``p2``.  The frame
        is empty (but has these columns) when no group has two or more IRIs.

    Raises:
        NameError: If ``merge_labels`` is neither passed nor defined at
            module level.
    """
    from itertools import combinations

    if merge_labels is None:
        try:
            merge_labels = globals()["merge_labels"]
        except KeyError:
            raise NameError("name 'merge_labels' is not defined") from None

    data = []
    done = set()
    for label in dd:
        if label in done:
            continue
        done.add(label)
        # Work on a fresh set: extending the list stored in `dd` would leak
        # merged IRIs into later lookups of the same label.
        iris = set(dd[label])
        for lab in merge_labels.get(label, ()):
            iris.update(dd.get(lab, ()))
            done.add(lab)
        for left, right in combinations(sorted(iris), 2):
            data.append([left, right])
            data.append([right, left])

    # Naming the columns up front keeps the empty case well-formed.
    df_mappings = pd.DataFrame(data, columns=['p1', 'p2']).drop_duplicates()
    df_mappings['cat'] = 'lexical'

    df_maps = pd.merge(df_mappings, l, how='left', left_on=['p1'], right_on=['iri'])
    df_maps = df_maps.drop(columns='iri')
    # The second merge meets the 'label' column added by the first one, so
    # pandas suffixes them to label_x (for p1) and label_y (for p2).
    df_maps = pd.merge(df_maps, l, how='left', left_on=['p2'], right_on=['iri'])
    df_maps = df_maps.drop(columns='iri')

    df_maps['o1'] = [_ontology_prefix(iri) for iri in df_maps['p1'].values]
    df_maps['o2'] = [_ontology_prefix(iri) for iri in df_maps['p2'].values]
    return df_maps


# Notebook driver: runs inside the notebook kernel (or as a script) and is
# skipped when the definitions above are imported.
if __name__ == "__main__":
    df_mapping = compute_mappings(dd, l)
    print(len(df_mapping))
    print(df_mapping.head())
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Ontology IDEquivalentClasses
0IDEC %
1http://purl.obolibrary.org/obo/HP_0000347http://purl.obolibrary.org/obo/MP_0002639
2http://purl.obolibrary.org/obo/MP_0002639http://purl.obolibrary.org/obo/HP_0000347
3http://purl.obolibrary.org/obo/MP_0004592http://purl.obolibrary.org/obo/HP_0000347
4http://purl.obolibrary.org/obo/HP_0000347http://purl.obolibrary.org/obo/MP_0004592
\n", + "
## Step to investigate why there are mappings of terms within the same ontology.
## Since exact synonyms and labels were used, no such mapping should exist.
## They are written to a separate "problematic" file and left out of df_maps.
# NOTE(review): the lexical CSV and the template below are still written from
# df_mapping, i.e. they include the same-ontology rows; only df_maps (used
# for the comparison with the logical mappings) excludes them.  Confirm that
# this is intended.

w = df_mapping[df_mapping['o1'] == df_mapping['o2']]
df_maps = df_mapping[df_mapping['o1'] != df_mapping['o2']]
print(len(w))
w.to_csv(upheno_mapping_problematic, index=False)

# Two-column template; the extra first row holds the template directives.
df_mapping_template = df_mapping[['p1', 'p2']].copy()
df_mapping_template.columns = ['Ontology ID', 'EquivalentClasses']

df_mapping_template.loc[-1] = ['ID', 'AI obo:UPHENO_0000002']  # adding a row
df_mapping_template.index = df_mapping_template.index + 1  # shifting index
df_mapping_template.sort_index(inplace=True)  # moves the added row to the top

df_mapping.to_csv(upheno_mapping_lexical, index=False)
df_mapping_template.to_csv(upheno_mapping_lexical_template, index=False)

w[['p1', 'p2']]
df_mapping_template.head()

# (next notebook cell)
# Merging the logical mappings with the lexical ones for comparison
print(df_maps.head())
# The outer merge keeps pairs found by either method; both frames have a
# 'cat' column, which pandas suffixes to cat_x (lexical) and cat_y (logical).
df_m = pd.merge(df_maps[['p1', 'p2', 'cat']], dfl, how='outer', on=['p1', 'p2'])
df_m = pd.merge(df_m, l, how='left', left_on=['p1'], right_on=['iri'])
df_m = df_m.drop(columns='iri')
df_m = pd.merge(df_m, l, how='left', left_on=['p2'], right_on=['iri'])
df_m = df_m.drop(columns='iri')

# Combined category: "lexical", "logical" or "lexical-logical", built from
# whichever of the two source categories is present for the pair.
df_m['cat'] = [
    "-".join(str(part) for part in pair if pd.notna(part))
    for pair in zip(df_m['cat_x'], df_m['cat_y'])
]
df_m = df_m.drop(columns=['cat_x', 'cat_y'])

print(df_m['cat'].value_counts(normalize=True))
print(df_m['cat'].value_counts())

df_m.to_csv(upheno_mapping_all, index=False)

df_m.head(5)