From 0bb8f9f3d715aae786b449e3bd52b810282213de Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 31 Oct 2019 07:55:14 -0700 Subject: [PATCH 01/16] removing case conflict --- .../Tutorials/interlex_remotes_tutorial.ipynb | 742 ------------------ 1 file changed, 742 deletions(-) delete mode 100644 ilxutils/Tutorials/interlex_remotes_tutorial.ipynb diff --git a/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb deleted file mode 100644 index f596fda2..00000000 --- a/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb +++ /dev/null @@ -1,742 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# INSTALL\n", - "- WARNING ::: ONLY DO ONCE \n", - " - update devconfig in ~/.config/pyontutils/devonfig.yaml\n", - " - scigraph_api: http://scigraph.scicrunch.io:9000/scigraph\n", - " - Install both pyontutils and ilxutils with pyontutils\n", - " - cd ~/git/pyontutils\n", - " - pip3 install --user --editable .\n", - " - cd ~/git/pyontutils/ilxutils/\n", - " - pip3 install --user --editable .\n", - " - Clone ontquery and install\n", - " - cd ~/git\n", - " - git clone https://github.com/tgbugs/ontquery.git\n", - " - cd ~/git/ontquery\n", - " - pip3 install --user --editable ." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Maintainance\n", - "- update repos\n", - " - cd ~/git/pyontutils\n", - " - git pull \n", - " - cd ~/git/ontquery\n", - " - git pull" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Google Sheets Import\n", - "### Need pyontutils secrets.yaml setup first!" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
termcurie
1abdominal cavityUBERON:0003684
2abdominal wallUBERON:0003697
3adipose tissueUBERON:0001013
4adult organismUBERON:0007023
5alimentary part of gastrointestinal systemUBERON:0005409
\n", - "
" - ], - "text/plain": [ - "0 term curie\n", - "1 abdominal cavity UBERON:0003684\n", - "2 abdominal wall UBERON:0003697\n", - "3 adipose tissue UBERON:0001013\n", - "4 adult organism UBERON:0007023\n", - "5 alimentary part of gastrointestinal system UBERON:0005409" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyontutils.sheets import Sheet\n", - "import pandas as pd\n", - "KEY_NAME = 'sparc-terms'\n", - "SHEET_NAME = 'Minimal information model(MIS)'\n", - "\n", - "class Brainstem(Sheet):\n", - " name = KEY_NAME # key name you gave the google sheet id value in secrets.yaml\n", - " sheet_name = SHEET_NAME # the actual sheet name on the google sheet\n", - " fetch_grid = True # meta data in self.grid that has detials like bolding\n", - "\n", - "brainstem = Brainstem()\n", - "df = pd.DataFrame(brainstem.raw_values)\n", - "df.columns = df.iloc[0]\n", - "df.drop(df.index[0], inplace=True)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['abdominal cavity',\n", - " 'abdominal wall',\n", - " 'adipose tissue',\n", - " 'adult organism',\n", - " 'alimentary part of gastrointestinal system',\n", - " 'arterial blood',\n", - " 'biceps femoris',\n", - " 'blood',\n", - " 'bolus of food',\n", - " 'brainstem']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(df.term)[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# CSV or TSV EXAMPLE" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import pandas as pd\n", - "\n", - "csv_df = pd.DataFrame('/path/to/csv')\n", - "tsv_df = pd.DataFrame('/path/to/tsv', delimiter='\\t')\n", - "\n", - "csv_df.head() # returns top 5 rows\n", - "csv_df.column_name # specific column name will return a Series which will act like a list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# QUERY DATABASES " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[OntTerm('HBA:3999', label='brain (hba)'),\n", - " OntTerm('FMA:50801', label='Brain'),\n", - " OntTerm('UBERON:0000955', label='brain'),\n", - " OntTerm('UBERON:6110636', label='adult cerebral ganglion'),\n", - " OntTerm('ILX:0101431', label='Brain'),\n", - " OntTerm('ILX:0101433', label='Brain Infarction'),\n", - " OntTerm('ILX:0506386', label='Brain Aneurysm'),\n", - " OntTerm('ILX:0433050', label='Brain Chemistry'),\n", - " OntTerm('ILX:0641746', label='alpha BRAIN'),\n", - " OntTerm('ILX:0726394', label='brain meninx'),\n", - " OntTerm('ILX:0729002', label='brain commissure'),\n", - " OntTerm('ILX:0101434', label='Brain Ischemia'),\n", - " OntTerm('ILX:0461406', label='Brain Death'),\n", - " OntTerm('ILX:0733041', label='brain endothelium')]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Give \"query\" a usable parameter to query the databases \n", - "from pyontutils.core import query # OntTerm\n", - "query(term='brain')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[OntTerm('ILX:0103358', label='DN1 neuron'),\n", - " OntTerm('ILX:0109525', label='Pupal DN1 period neuron')]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# similar entities will show\n", - "# default limit is 10\n", - "query(term='DN1 neuron', limit=2) " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[OntTerm('UBERON:0000955', label='brain')]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Faster and more accurate with curie/iri\n", - "query(curie='UBERON:0000955')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'prefix': 'UBERON',\n", - " 'suffix': '0000955',\n", - " 'orig_kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", - " 'curie_or_iri': None,\n", - " 'label': None,\n", - " 'term': None,\n", - " 'search': None,\n", - " 'validated': None,\n", - " 'query': None},\n", - " 'kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", - " 'curie_or_iri': None,\n", - " 'label': None,\n", - " 'term': None,\n", - " 'search': None,\n", - " 'validated': None,\n", - " 'query': None},\n", - " 'label': 'brain',\n", - " 'labels': ['brain'],\n", - " 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].',\n", - " 'synonyms': ['the brain',\n", - " 'synganglion',\n", - " 'suprasegmental structures',\n", - " 'suprasegmental levels of nervous system',\n", - " 'encephalon'],\n", - " 'deprecated': False,\n", - " 'predicates': {},\n", - " '_type': OntId('owl:Class'),\n", - " '_types': (OntId('owl:Class'),),\n", - " '_graph': None,\n", - " '_source': ,\n", - " 'validated': True,\n", - " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entity = query(curie='UBERON:0000955')[0]\n", - "# Full result attribute\n", - "vars(entity)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DEBUGGING HINT\n", - "- 1 \"?\" at the end of a function or class will return its params, docstring, and pathing. \n", - "- 2 \"??\" returns the ENTIRE class/functions " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mCall signature:\u001b[0m\n", - "\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mterm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mprefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcategory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mabbrev\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msearch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msuffix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcurie\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0miri\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mexclude_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdepth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdirection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'OUTGOING'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0minclude_deprecated\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mType:\u001b[0m OntQueryCli\n", - "\u001b[0;31mString form:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/query.py\n", - "\u001b[0;31mDocstring:\u001b[0m \n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "query?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BONUS!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Concurrently search! (Run multiple query functions at the same time)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Futures compiled\n" - ] - }, - { - "data": { - "text/plain": [ - "[({'curie': 'UBERON:0000955'}, [OntTerm('UBERON:0000955', label='brain')]),\n", - " ({'curie': 'UBERON:6110636'},\n", - " [OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyontutils.utils import Async, deferred\n", - "from pyontutils.core import OntTerm, ixr, query\n", - "from typing import List, Tuple\n", - "\n", - "# query.setup()\n", - "\n", - "def queries(kwargs_list:List[dict]) -> List[Tuple[str, dict]]:\n", - " '''Asynchronously query databases to dramatically increase runtime un users end \n", - " \n", - " Examples:\n", - " >>> queries([{'term':'Brain'},])\n", - " [({'term': 'Brain'},\n", - " [OntTerm('HBA:3999', label='brain (hba)'),\n", - " OntTerm('FMA:50801', label='Brain'),\n", - " OntTerm('UBERON:0000955', label='brain'),\n", - " OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]\n", - " >>> queries([{'curie':'UBERON:0000955'},])\n", - " [({'curie': 'UBERON:0000955'}, [OntTerm('UBERON:0000955', label='brain')])]\n", - " \n", - " Definitions:\n", - " kwargs == common name given to dictionary input for function\n", - " tuple == a list that you cannot update. \n", - " lambda == short-hand for single line function creation (func = lambda : ) \n", - " \n", - " Args:\n", - " kwargs_list (list): A list of dictionaries that are paramaters for the query function\n", - " \n", - " Returns:\n", - " List[tuple]: A list of tuples all being of (, ). \n", - " '''\n", - " # create a query function wrapper to return tuple\n", - " # kwargs -> (kwargs, query_result)\n", - " # We do this in case 2+ queries return the same results & the output WILL NOT have the same input order\n", - " gin = lambda kwargs: (kwargs, query(**kwargs))\n", - " # run each query instance at the same time\n", - " results = Async(use_nest_asyncio=True)(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", - " return results \n", - "\n", - "queries([{'curie':'UBERON:0000955'}, {'curie':'UBERON:6110636'}])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyontutils.utils import Async, deferred\n", - "from pyontutils.core import OntTerm, ixr, query\n", - "from typing import List, Tuple\n", - "def queries(url_list:List[dict]) -> List[Tuple[str, dict]]:\n", - " def gin(url):\n", - " return requests.get(url).text\n", - " # run each query instance at the same time\n", - " results = Async(limit=5)(deferred(gin)(url) for url in url_list)\n", - " return results \n", - "list_tuples(url, html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Modifing TEST InterLex" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# TEST InterLex endpoints\n", - "from ilxutils.remotes import interlex_remote_test as ixrt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# GET ENTITY" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '661544',\n", - " 'orig_uid': '34142',\n", - " 'uid': '34142',\n", - " 'orig_cid': '0',\n", - " 'cid': '0',\n", - " 'ilx': 'ilx_0738390',\n", - " 'label': 'Offical label',\n", - " 'type': 'term',\n", - " 'definition': 'official definition',\n", - " 'comment': 'helpful misc',\n", - " 'version': '3',\n", - " 'status': '0',\n", - " 'display_superclass': '1',\n", - " 'orig_time': '1564695195',\n", - " 'time': '1570826848',\n", - " 'synonyms': [{'id': '1776645',\n", - " 'tid': '661544',\n", - " 'literal': 'Encephalon',\n", - " 'type': '',\n", - " 'time': '1570826848',\n", - " 'version': '3'},\n", - " {'id': '1776646',\n", - " 'tid': '661544',\n", - " 'literal': 'Cerebro',\n", - " 'type': '',\n", - " 'time': '1570826848',\n", - " 'version': '3'}],\n", - " 'superclasses': [],\n", - " 'existing_ids': [{'id': '3885545',\n", - " 'tid': '661544',\n", - " 'curie': 'ILX:0738390',\n", - " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", - " 'curie_catalog_id': '3885424',\n", - " 'version': '3',\n", - " 'time': '1570826848',\n", - " 'preferred': '1'}],\n", - " 'relationships': [],\n", - " 'mappings': [],\n", - " 'annotations': [],\n", - " 'ontologies': []}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ixrt.ilx_cli.get_entity('tmp_0738390')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ADD" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0mixrt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_entity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msubThingOf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdefinition\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", - "\u001b[0;31mType:\u001b[0m method\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "ixrt.add_entity?" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" - ] - } - ], - "source": [ - "entity = dict(\n", - " label = 'offical label', # Can only one unique label per person\n", - " type = 'term', # OPTIONS: term, annotation, relationship, cde, fde, pde\n", - " definition = 'official definition',\n", - " comment = 'helpful misc',\n", - " # Optional\n", - " subThingOf = '', # WARNING ::: must have at last '', can be blank but please fill this in if you can. \n", - " synonyms = ['Encephalon', 'Cerebro'],\n", - " predicates = {} # annotations and/or relationships to add\n", - " # TODO: existing_ids will be an option later\n", - ")\n", - "result = ixrt.add_entity(**entity)\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# UPDATE" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0mixrt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_entity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0milx_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msubThingOf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdefinition\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_add\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_delete\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", - "\u001b[0;31mType:\u001b[0m method\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "ixrt.update_entity?" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[2019-10-11 13:47:28,619]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:796 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro']}\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" - ] - } - ], - "source": [ - "from ilxutils.remotes import interlex_remote_test as ixrt\n", - "entity = dict(\n", - " ilx_id = 'ILX:0738390',\n", - " label = 'Offical label', # Can only one unique label per person\n", - " type = 'term', # OPTIONS: term, annotation, relationship, cde, fde, pde\n", - " definition = 'official definition',\n", - " comment = 'helpful misc',\n", - " # Optional\n", - " subThingOf = '', # WARNING ::: must have at last '', can be blank but please fill this in if you can. \n", - " synonyms = ['Encephalon', 'Cerebro'],\n", - " predicates_to_add = {}, # annotations and/or relationships to add\n", - " predicates_to_delete = {}, # annotations and/or relationships to del\n", - " # TODO: existing_ids will be an option later\n", - ")\n", - "result = ixrt.update_entity(**entity)\n", - "print(result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PRODUCTION \n", - "# BE CAREFUL PLEASE :)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# PRODUCTION \n", - "from ilxutils.remotes import interlex_remote_production as ixr\n", - "# BE CAREFUL :)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 125b51c585b831ec0f69d4eabdd303b96e8cfc0a Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Tue, 5 Nov 2019 10:03:45 -0800 Subject: [PATCH 02/16] new elastic search wrapper --- ilxutils/ilxutils/elasticsearch_wrapper.py | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 ilxutils/ilxutils/elasticsearch_wrapper.py diff --git a/ilxutils/ilxutils/elasticsearch_wrapper.py b/ilxutils/ilxutils/elasticsearch_wrapper.py new file mode 100644 index 00000000..ee9e8d76 --- /dev/null +++ b/ilxutils/ilxutils/elasticsearch_wrapper.py @@ -0,0 +1,107 @@ +from functools import wraps +import json +import os +import subprocess +import docopt +from elasticsearch import Elasticsearch +BASHRC = lambda s: os.environ.get(s) + + +class ElasticSearchTools: + """ Shortcuts for common elasticsearch querys. """ + + def __init__(self, + host: str, index: str, type: str, + user: str, password: str, + size: int = 10, start: int = 0, + scheme: str = 'https',) -> None: + """ + :param str url: ElasticSearch url endpoint. + :param str index: + """ + self.url = f'{scheme}://{host}/{index}' + self.host, self.index, self.type = host, index, type + self.es = Elasticsearch(self.url, http_auth=(user, password)) + + def search(self, body: dict, **kwargs) -> dict: + """ Elasticsearch '/_search' feature. + + We use a framented index called a type. The type is the last index + while the real index becomes part of the host url. + + :param dict body: query dict. + :return: nested elasticsearch dict where hits are in ['hits']['hits'] + + >>>__search(body={ 'query': { 'match_all': {} } }) + """ + return self.es.search(index=self.type, body=body, **kwargs) + + def all_matches(self, sorting: str, size, start) -> dict: + """First or last set of entities. + + :param str sorting: asc for head or desc for tail. + :param int size: number of entities you want from head or tails. + :param int start: position of index you want to start from. + :return: elasticsearch _search dict + """ + if sorting.lower().strip() not in ['asc', 'desc']: + raise ValueError('sorting can only be asc or desc.') + body = { + 'query': { 'match_all': {} }, + 'sort': [ { '_id': sorting } ], + 'size': size, + 'from': start, + } + return self.search(body) + + def head(self, size=10, start=0): + """ See __end doc. """ + return self.all_matches(sorting='asc', size=size, start=start) + + def tail(self, size=10, start=0): + """ See __end doc. """ + return self.all_matches(sorting='desc', size=size, start=start) + + +class InterLexES(ElasticSearchTools): + + def __init__(self, beta=True): + super().__init__( + host = BASHRC('INTERLEX_ELASTIC_URL'), + index = 'interlex', + type = 'term', + user = BASHRC('INTERLEX_ELASTIC_USER'), + password = BASHRC('INTERLEX_ELASTIC_PASSWORD'), + ) + self.beta = beta + + def filter_tmp(self): + prefix = 'tmp_' if self.beta else 'ilx_' + return { 'prefix': { 'ilx' : { 'value': prefix } } } + + def all_matches(self, sorting: str, size, start) -> dict: + """First or last set of entities. + + :param str sorting: asc for head or desc for tail. + :param int size: number of entities you want from head or tails. + :param int start: position of index you want to start from. + :return: elasticsearch _search dict + """ + if sorting.lower().strip() not in ['asc', 'desc']: + raise ValueError('sorting can only be asc or desc.') + body = { + 'query': self.filter_tmp(), + 'sort': [ { '_id': sorting } ], + 'size': size, + 'from': start, + } + return self.search(body) + + +def main(): + ilxes = InterLexES(beta=False) + print(ilxes.tail(1)) + + +if __name__ == '__main__': + main() From c7c71dfdd789a552a87b47f6d789987b8b227874 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Fri, 13 Dec 2019 12:40:32 -0800 Subject: [PATCH 03/16] shortcut to sql and update print on ontopandas --- ilxutils/ilxutils/ontopandas.py | 6 ++++++ ilxutils/ilxutils/sql.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 ilxutils/ilxutils/sql.py diff --git a/ilxutils/ilxutils/ontopandas.py b/ilxutils/ilxutils/ontopandas.py index faf2581b..f777863b 100644 --- a/ilxutils/ilxutils/ontopandas.py +++ b/ilxutils/ilxutils/ontopandas.py @@ -43,10 +43,12 @@ class OntoPandas: def __init__(self, obj: Union[rdflib.graph.Graph, str], query:str=defaultquery, + curie:bool=True, qnamed:bool=False, str_vals:bool=False,) -> None: self.query = query self.qnamed = qnamed + self.curie = curie self.str_vals = str_vals self.g = obj # could be path self.path = obj # could be graph @@ -270,6 +272,10 @@ def get_sparql_dataframe( self ): df = df.where((pd.notnull(df)), None) # default Null is fricken Float NaN df = df.reset_index().rename(columns={'index':'iri'}) + + if self.curie: + df['curie'] = df.apply(lambda row: self.qname(row.iri), axis = 1) + return df diff --git a/ilxutils/ilxutils/sql.py b/ilxutils/ilxutils/sql.py new file mode 100644 index 00000000..c4176a71 --- /dev/null +++ b/ilxutils/ilxutils/sql.py @@ -0,0 +1,16 @@ +from interlex_sql import IlxSql +import os + +def production_sql(from_backup=True): + return IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION'), from_backup=from_backup) + +def beta_sql(from_backup=True): + # TEST{#} should be a thing since this still relies on main sql test + return IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_BETA'), from_backup=from_backup) + +# entities = [] +# for ilx, group in ex.groupby('ilx'): +# if not any(list(group['preferred'] == '1')): +# entities.append(ilx) + +# from ontquery.plugins.services.interlex_client import InterLexClient From 4559c51ec0ef9e3b75b1cfcc7a31b67296cac9da Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Tue, 7 Jan 2020 11:45:33 -0800 Subject: [PATCH 04/16] this was lost --- pyontutils/ontutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyontutils/ontutils.py b/pyontutils/ontutils.py index 0be7935b..f4fc60d1 100755 --- a/pyontutils/ontutils.py +++ b/pyontutils/ontutils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3.7 #!/usr/bin/env pypy3 -from pyontutils.config import auth +from pyontutils.config import auth, devconfig __doc__ = f"""Common commands for ontology processes. Also old ontology refactors to run in the root ttl folder. From a27a253e9f56b863814f570a0fd7ed6289672e04 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Tue, 7 Jan 2020 11:48:00 -0800 Subject: [PATCH 05/16] update ilx references --- ilxutils/ilx-playground.ipynb | 16 ++-- .../tutorials/interlex_remotes_tutorial.ipynb | 82 ++++--------------- 2 files changed, 21 insertions(+), 77 deletions(-) diff --git a/ilxutils/ilx-playground.ipynb b/ilxutils/ilx-playground.ipynb index 7f8afdfb..7072d902 100644 --- a/ilxutils/ilx-playground.ipynb +++ b/ilxutils/ilx-playground.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 33, @@ -681,7 +674,12 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [ { "name": "stderr", @@ -6477,7 +6475,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.5" } }, "nbformat": 4, diff --git a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb index f596fda2..85b28da4 100644 --- a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb +++ b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb @@ -42,76 +42,22 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
termcurie
1abdominal cavityUBERON:0003684
2abdominal wallUBERON:0003697
3adipose tissueUBERON:0001013
4adult organismUBERON:0007023
5alimentary part of gastrointestinal systemUBERON:0005409
\n", - "
" - ], - "text/plain": [ - "0 term curie\n", - "1 abdominal cavity UBERON:0003684\n", - "2 abdominal wall UBERON:0003697\n", - "3 adipose tissue UBERON:0001013\n", - "4 adult organism UBERON:0007023\n", - "5 alimentary part of gastrointestinal system UBERON:0005409" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'exists'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mfetch_grid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m \u001b[0;31m# meta data in self.grid that has detials like bolding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mbrainstem\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBrainstem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbrainstem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, sheet_name, fetch_grid, readonly)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 210\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 211\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36m_setup\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSheet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__spreadsheet_service_ro'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 216\u001b[0;31m \u001b[0mservice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_oauth_service\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# I think it is correct to keep this ephimoral\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0mSheet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__spreadsheet_service_ro\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspreadsheets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36mget_oauth_service\u001b[0;34m(api, version, readonly, SCOPES)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0mstore_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_auth_var\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mstore_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstore_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'exists'" + ] } ], "source": [ @@ -734,7 +680,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" } }, "nbformat": 4, From 3c72023e7d73a98f2508e58a317cc69bf5e69ff8 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Mon, 24 Feb 2020 16:55:40 -0800 Subject: [PATCH 06/16] updates --- ilxutils/ilxutils/backup_ilx.py | 10 ++-- ilxutils/ilxutils/interlex_sql.py | 11 ++-- ilxutils/ilxutils/nltklib.py | 3 +- ilxutils/ilxutils/remotes.py | 29 +++++----- ilxutils/ilxutils/scicrunch_session.py | 74 ++++++++++++++++++++++++++ ilxutils/ilxutils/sql.py | 2 +- 6 files changed, 106 insertions(+), 23 deletions(-) create mode 100644 ilxutils/ilxutils/scicrunch_session.py diff --git a/ilxutils/ilxutils/backup_ilx.py b/ilxutils/ilxutils/backup_ilx.py index a4b102f7..2a94a962 100644 --- a/ilxutils/ilxutils/backup_ilx.py +++ b/ilxutils/ilxutils/backup_ilx.py @@ -1,9 +1,13 @@ from pathlib import Path as p -from ilxutils.interlex_sql import IlxSql -from ilxutils.tools import create_pickle +from interlex_sql import IlxSql +# from tools import create_pickle +import pickle import os - +def create_pickle(data, outfilename): + with open(outfilename, 'wb') as outfile: + pickle.dump(data, outfile) + sql = IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION')) diff --git a/ilxutils/ilxutils/interlex_sql.py b/ilxutils/ilxutils/interlex_sql.py index 256a0f83..ce45497b 100755 --- a/ilxutils/ilxutils/interlex_sql.py +++ b/ilxutils/ilxutils/interlex_sql.py @@ -73,7 +73,7 @@ def get_terms(self): return self.terms engine = create_engine(self.db_url) data = """ - SELECT t.id as tid, t.ilx, t.label, t.definition, t.type, t.comment, t.version, t.uid, t.time + SELECT t.id as tid, t.ilx, t.label, t.definition, t.type, t.comment, t.version, t.uid, t.cid, t.time, t.status FROM terms t GROUP BY t.ilx """ @@ -118,7 +118,7 @@ def get_existing_ids(self): return self.existing_ids engine = create_engine(self.db_url) data = """ - SELECT tei.tid, tei.curie, tei.iri, tei.preferred, t.ilx, t.label, t.definition + SELECT tei.tid, tei.curie, tei.iri, tei.preferred, t.ilx, t.label, t.definition, t.status FROM ( SELECT * FROM terms @@ -140,24 +140,27 @@ def get_relationships(self): engine = create_engine(self.db_url) data = """ SELECT - t1.id as term1_tid, t1.ilx AS term1_ilx, t1.type as term1_type, - t2.id as term2_tid, t2.ilx AS term2_ilx, t2.type as term2_type, + t1.id as term1_tid, t1.ilx AS term1_ilx, t1.type as term1_type, t1.label as term1_label, + t2.id as term2_tid, t2.ilx AS term2_ilx, t2.type as term2_type, t2.label as term2_label, t3.id as relationship_tid, t3.ilx AS relationship_ilx, t3.label as relationship_label FROM term_relationships AS tr JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) t1 ON t1.id = tr.term1_id JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t2 ON t2.id = tr.term2_id JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t3 ON t3.id = tr.relationship_tid """ self.relationships = pd.read_sql(data, engine) diff --git a/ilxutils/ilxutils/nltklib.py b/ilxutils/ilxutils/nltklib.py index da5c9918..27acae74 100644 --- a/ilxutils/ilxutils/nltklib.py +++ b/ilxutils/ilxutils/nltklib.py @@ -137,7 +137,8 @@ def sentence_similarity(sentence1, sentence2, ignore_integers=False): tokens2 = word_tokenize(sentence2) tokens1 = clean_tokens(tokens1, ignore_integers) tokens2 = clean_tokens(tokens2, ignore_integers) - + print(tokens1) + print(tokens2) # tag sentence1 = pos_tag(tokens1) sentence2 = pos_tag(tokens2) diff --git a/ilxutils/ilxutils/remotes.py b/ilxutils/ilxutils/remotes.py index 4137adb1..711766c2 100644 --- a/ilxutils/ilxutils/remotes.py +++ b/ilxutils/ilxutils/remotes.py @@ -2,19 +2,20 @@ from pyontutils.core import OntTerm import os -TEST = 'https://test3.scicrunch.org/api/1/' -PRODUCTION = 'https://scicrunch.org/api/1/' +def remote(server=''): -InterLexRemote = oq.plugin.get('InterLex') -interlex_remote_production = InterLexRemote( - # When ready, should be changed to 'https://scicrunch.org/api/1/' for production - apiEndpoint = PRODUCTION -) -interlex_remote_production.setup(instrumented=OntTerm) + # Request interlex remote (scigraph is also an option for plugins) + InterLexRemote = oq.plugin.get('InterLex') -# InterLexRemote = oq.plugin.get('InterLex') -# interlex_remote_test = InterLexRemote( -# # When ready, should be changed to 'https://scicrunch.org/api/1/' for production -# apiEndpoint = TEST -# ) -# interlex_remote_test.setup(instrumented=OntTerm) + if server: + server = server if server.endswith('.') else server + '.' + endpoint = f'https://{server}scicrunch.org/api/1/' + + # + interlex_remote = InterLexRemote() + + # setup inheritance classes + interlex_remote.setup(instrumented=OntTerm) + interlex_remote.apiEndpoint = endpoint + + return interlex_remote diff --git a/ilxutils/ilxutils/scicrunch_session.py b/ilxutils/ilxutils/scicrunch_session.py new file mode 100644 index 00000000..b1b0bfa2 --- /dev/null +++ b/ilxutils/ilxutils/scicrunch_session.py @@ -0,0 +1,74 @@ +import json +import requests +from typing import Union, Dict, List +from urllib.parse import urljoin + + +class ScicrunchSession: + """ Boiler plate for SciCrunch server responses. """ + + def __init__(self, + key: str, + host: str = 'scicrunch.org', + auth: tuple = (None, None)) -> None: + """ Initialize Session with SciCrunch Server. + + :param str key: API key for SciCrunch [should work for test hosts]. + :param str host: Base url for hosting server [can take localhost:8080]. + :param str user: username for test server. + :param str password: password for test server. + """ + self.key = key + self.host = host + + # https is only for security level environments + if self.host.startswith('localhost'): + self.api = "http://" + self.host + '/api/1/' + else: + self.api = "https://" + self.host + '/api/1/' + + self.session = requests.Session() + self.session.auth = auth + self.session.headers.update({'Content-type': 'application/json'}) + + def __session_shortcut(self, endpoint: str, data: dict, session_type: str = 'GET') -> dict: + """ Short for both GET and POST. + + Will only crash if success is False or if there a 400+ error. + """ + def _prepare_data(data: dict) -> dict: + """ Check if request data inputed has key and proper format. """ + if data is None: + data = {'key': self.key} + elif isinstance(data, dict): + data.update({'key': self.key}) + else: + raise ValueError('request session data must be of type dictionary') + return json.dumps(data) + + url = urljoin(self.api, endpoint) + data = _prepare_data(data) + try: + # TODO: Could use a Request here to shorten code. + if session_type == 'GET': + response = self.session.get(url, data=data) + else: + response = self.session.post(url, data=data) + # crashes if success on the server side is False + if not response.json()['success']: + raise ValueError(response.text + f' -> STATUS CODE: {response.status_code}') + response.raise_for_status() + # crashes if the server couldn't use it or it never made it. + except requests.exceptions.HTTPError as error: + raise error + + # {'data':{}, 'success':bool} + return response.json()['data'] + + def get(self, endpoint: str, data: dict = None) -> dict: + """ Quick GET for SciCrunch. """ + return self.__session_shortcut(endpoint, data, 'GET') + + def post(self, endpoint: str , data: dict = None) -> dict: + """ Quick POST for SciCrunch. """ + return self.__session_shortcut(endpoint, data, 'POST') diff --git a/ilxutils/ilxutils/sql.py b/ilxutils/ilxutils/sql.py index c4176a71..b3470e74 100644 --- a/ilxutils/ilxutils/sql.py +++ b/ilxutils/ilxutils/sql.py @@ -1,4 +1,4 @@ -from interlex_sql import IlxSql +from .interlex_sql import IlxSql import os def production_sql(from_backup=True): From 53d71c4ff2323fe06c72aa0a781408f4dc5b3d8a Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Fri, 10 Apr 2020 13:28:58 -0700 Subject: [PATCH 07/16] updated tutorials --- .../tutorials/interlex_remotes_tutorial.ipynb | 182 +++++++----------- 1 file changed, 70 insertions(+), 112 deletions(-) diff --git a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb index 85b28da4..db9004a3 100644 --- a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb +++ b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb @@ -42,24 +42,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'exists'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mfetch_grid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m \u001b[0;31m# meta data in self.grid that has detials like bolding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mbrainstem\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBrainstem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbrainstem\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, sheet_name, fetch_grid, readonly)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreadonly\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 210\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 211\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36m_setup\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSheet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__spreadsheet_service_ro'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 216\u001b[0;31m \u001b[0mservice\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_oauth_service\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadonly\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# I think it is correct to keep this ephimoral\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0mSheet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__spreadsheet_service_ro\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspreadsheets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Dropbox/git/pyontutils/pyontutils/sheets.py\u001b[0m in \u001b[0;36mget_oauth_service\u001b[0;34m(api, version, readonly, SCOPES)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0mstore_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_auth_var\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mstore_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstore_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'exists'" - ] - } - ], + "outputs": [], "source": [ "from pyontutils.sheets import Sheet\n", "import pandas as pd\n", @@ -136,29 +121,17 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[OntTerm('HBA:3999', label='brain (hba)'),\n", - " OntTerm('FMA:50801', label='Brain'),\n", - " OntTerm('UBERON:0000955', label='brain'),\n", - " OntTerm('UBERON:6110636', label='adult cerebral ganglion'),\n", - " OntTerm('ILX:0101431', label='Brain'),\n", - " OntTerm('ILX:0101433', label='Brain Infarction'),\n", - " OntTerm('ILX:0506386', label='Brain Aneurysm'),\n", - " OntTerm('ILX:0433050', label='Brain Chemistry'),\n", - " OntTerm('ILX:0641746', label='alpha BRAIN'),\n", - " OntTerm('ILX:0726394', label='brain meninx'),\n", - " OntTerm('ILX:0729002', label='brain commissure'),\n", - " OntTerm('ILX:0101434', label='Brain Ischemia'),\n", - " OntTerm('ILX:0461406', label='Brain Death'),\n", - " OntTerm('ILX:0733041', label='brain endothelium')]" + "[OntTerm('UBERON:0000955', label='brain'),\n", + " OntTerm('UBERON:6110636', label='adult cerebral ganglion')]" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -171,17 +144,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[OntTerm('ILX:0103358', label='DN1 neuron'),\n", - " OntTerm('ILX:0109525', label='Pupal DN1 period neuron')]" + "[]" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -189,12 +161,12 @@ "source": [ "# similar entities will show\n", "# default limit is 10\n", - "query(term='DN1 neuron', limit=2) " + "query(term='brain', limit=10, prefix=('ILX')) " ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -203,7 +175,7 @@ "[OntTerm('UBERON:0000955', label='brain')]" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -215,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -223,20 +195,6 @@ "text/plain": [ "{'prefix': 'UBERON',\n", " 'suffix': '0000955',\n", - " 'orig_kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", - " 'curie_or_iri': None,\n", - " 'label': None,\n", - " 'term': None,\n", - " 'search': None,\n", - " 'validated': None,\n", - " 'query': None},\n", - " 'kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", - " 'curie_or_iri': None,\n", - " 'label': None,\n", - " 'term': None,\n", - " 'search': None,\n", - " 'validated': None,\n", - " 'query': None},\n", " 'label': 'brain',\n", " 'labels': ['brain'],\n", " 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].',\n", @@ -250,12 +208,12 @@ " '_type': OntId('owl:Class'),\n", " '_types': (OntId('owl:Class'),),\n", " '_graph': None,\n", - " '_source': ,\n", + " '_source': ,\n", " 'validated': True,\n", - " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" + " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" ] }, - "execution_count": 6, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -277,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -301,9 +259,12 @@ "\u001b[0;34m\u001b[0m \u001b[0mdirection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'OUTGOING'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0minclude_deprecated\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minclude_supers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minclude_all_services\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mraw\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mType:\u001b[0m OntQueryCli\n", - "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mString form:\u001b[0m \n", "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/query.py\n", "\u001b[0;31mDocstring:\u001b[0m \n" ] @@ -332,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -350,7 +311,7 @@ " [OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]" ] }, - "execution_count": 8, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -391,30 +352,12 @@ " # We do this in case 2+ queries return the same results & the output WILL NOT have the same input order\n", " gin = lambda kwargs: (kwargs, query(**kwargs))\n", " # run each query instance at the same time\n", - " results = Async(use_nest_asyncio=True)(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", + " results = Async()(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", " return results \n", "\n", "queries([{'curie':'UBERON:0000955'}, {'curie':'UBERON:6110636'}])" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyontutils.utils import Async, deferred\n", - "from pyontutils.core import OntTerm, ixr, query\n", - "from typing import List, Tuple\n", - "def queries(url_list:List[dict]) -> List[Tuple[str, dict]]:\n", - " def gin(url):\n", - " return requests.get(url).text\n", - " # run each query instance at the same time\n", - " results = Async(limit=5)(deferred(gin)(url) for url in url_list)\n", - " return results \n", - "list_tuples(url, html)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -424,12 +367,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# TEST InterLex endpoints\n", - "from ilxutils.remotes import interlex_remote_test as ixrt" + "from ilxutils.remotes import remote\n", + "ixrt = remote(server='test3')" ] }, { @@ -441,7 +385,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -453,35 +397,35 @@ " 'orig_cid': '0',\n", " 'cid': '0',\n", " 'ilx': 'ilx_0738390',\n", - " 'label': 'Offical label',\n", + " 'label': 'official test label',\n", " 'type': 'term',\n", - " 'definition': 'official definition',\n", + " 'definition': 'definition',\n", " 'comment': 'helpful misc',\n", - " 'version': '3',\n", + " 'version': '2',\n", " 'status': '0',\n", " 'display_superclass': '1',\n", " 'orig_time': '1564695195',\n", - " 'time': '1570826848',\n", - " 'synonyms': [{'id': '1776645',\n", + " 'time': '1564695333',\n", + " 'synonyms': [{'id': '1776589',\n", " 'tid': '661544',\n", " 'literal': 'Encephalon',\n", " 'type': '',\n", - " 'time': '1570826848',\n", - " 'version': '3'},\n", - " {'id': '1776646',\n", + " 'time': '1564695333',\n", + " 'version': '2'},\n", + " {'id': '1776590',\n", " 'tid': '661544',\n", " 'literal': 'Cerebro',\n", " 'type': '',\n", - " 'time': '1570826848',\n", - " 'version': '3'}],\n", + " 'time': '1564695333',\n", + " 'version': '2'}],\n", " 'superclasses': [],\n", - " 'existing_ids': [{'id': '3885545',\n", + " 'existing_ids': [{'id': '3885425',\n", " 'tid': '661544',\n", " 'curie': 'ILX:0738390',\n", " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", " 'curie_catalog_id': '3885424',\n", - " 'version': '3',\n", - " 'time': '1570826848',\n", + " 'version': '2',\n", + " 'time': '1564695334',\n", " 'preferred': '1'}],\n", " 'relationships': [],\n", " 'mappings': [],\n", @@ -489,7 +433,7 @@ " 'ontologies': []}" ] }, - "execution_count": 2, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -507,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -522,9 +466,11 @@ "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mexisting_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mDocstring:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services/interlex.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, @@ -538,14 +484,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" ] } ], @@ -574,7 +520,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -590,10 +536,20 @@ "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_add\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0madd_existing_ids\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdelete_existing_ids\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_delete\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcid\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Update existing entity.\n", + "\n", + ":param List[dict] add_existing_ids: iris and curies to be added to entity.\n", + ":param List[dict] delete_existing_ids: iris and curies to be deleted from entity.\n", + "\n", + ">>>update_entity(add_existing_ids=[{'ilx_id':'ilx_1234567', 'iri':'http://abc.org/abc_123', 'curie':'ABC:123'}])\n", + ">>>update_entity(delete_existing_ids=[{'ilx_id':'ilx_1234567', 'iri':'http://abc.org/abc_123', 'curie':'ABC:123'}])\n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services/interlex.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, @@ -607,26 +563,27 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[2019-10-11 13:47:28,619]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:796 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro']}\u001b[0m\n" + "\u001b[32m[2020-04-10 13:25:53,802]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:962 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro'], 'add_existing_ids': None, 'delete_existing_ids': None, 'status': '0', 'cid': None}\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" ] } ], "source": [ - "from ilxutils.remotes import interlex_remote_test as ixrt\n", + "from ilxutils.remotes import remote\n", + "ixrt = remote(server='test3')\n", "entity = dict(\n", " ilx_id = 'ILX:0738390',\n", " label = 'Offical label', # Can only one unique label per person\n", @@ -654,12 +611,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# PRODUCTION \n", - "from ilxutils.remotes import interlex_remote_production as ixr\n", + "from ilxutils.remotes import remote \n", + "ixr = remote()\n", "# BE CAREFUL :)" ] } @@ -680,7 +638,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.6" } }, "nbformat": 4, From 3e5e5c99f7a89e4afd2b2dd8bf74a198ee7adc1b Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Fri, 10 Apr 2020 14:33:25 -0700 Subject: [PATCH 08/16] fixed bugs --- ilxutils/ilxutils/interlex_sql.py | 223 ++++++++++-------- ilxutils/ilxutils/remotes.py | 2 +- ilxutils/ilxutils/scicrunch_session.py | 113 ++++++++- .../tutorials/interlex_remotes_tutorial.ipynb | 59 +++++ 4 files changed, 284 insertions(+), 113 deletions(-) diff --git a/ilxutils/ilxutils/interlex_sql.py b/ilxutils/ilxutils/interlex_sql.py index ce45497b..733c410f 100755 --- a/ilxutils/ilxutils/interlex_sql.py +++ b/ilxutils/ilxutils/interlex_sql.py @@ -1,33 +1,84 @@ +from collections import defaultdict +import os from pathlib import Path +from typing import Union, Dict, Tuple, List + import pandas as pd from sqlalchemy import create_engine, inspect, Table, Column -from collections import defaultdict + from ilxutils.tools import light_degrade, open_pickle, create_pickle -import os -#ELASTIC = 'https://5f86098ac2b28a982cebf64e82db4ea2.us-west-2.aws.found.io:9243/interlex/term/' -TERMS_COMPLETE_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_terms_complete_backup.pickle' -TERMS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_terms_backup.pickle' -ANNOS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_annotations_backup.pickle' -RELAS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_relationships_backup.pickle' -SUPER_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_superclasses_backup.pickle' -SYNOS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_synonyms_backup.pickle' -EXIDS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_ex_backup.pickle' + + +TERMSC_BACKUP = 'ilx_db_terms_complete_backup.pickle' +TERMS_BACKUP = 'ilx_db_terms_backup.pickle' +ANNOS_BACKUP = 'ilx_db_annotations_backup.pickle' +RELAS_BACKUP = 'ilx_db_relationships_backup.pickle' +SUPER_BACKUP = 'ilx_db_superclasses_backup.pickle' +SYNOS_BACKUP = 'ilx_db_synonyms_backup.pickle' +EXIDS_BACKUP = 'ilx_db_existing_ids_backup.pickle' + + +def pathing(path, check_path=False): + """ Extract absolute path from shortened or relative paths. + + :param str path: path to file or folder. + + :examples: + >>>pathing('~/shortened.filepath') + >>>pathing('../relative.filepath') + >>>pathing('relative.filepath') + >>>pathing('/home/absoulte/filepath') + """ + path = Path(path) + if str(path).startswith('~'): + path = path.expanduser() + else: + path = path.resolve() + + if check_path: + if not path.is_file() and path.is_dir(): + raise ValueError(f'{path} does not exit') + + return path class IlxSql(): - def __init__(self, db_url, pre_load=False, from_backup=False): - self.db_url = db_url - self.engine = create_engine(self.db_url) - self.local_degrade = lambda string: string.lower().strip() # current degrade of choice for sql + def __init__(self, + db_url: str, + from_backup: bool = False, + pre_load: bool = False, + backups_folder: str = '~/.interlex_backups'): + + self.engine = create_engine(db_url) self.from_backup = from_backup - self.terms_complete = self.get_terms_complete() if pre_load else pd.DataFrame - self.terms = self.get_terms() if pre_load else pd.DataFrame - self.superclasses = self.get_superclasses if pre_load else pd.DataFrame - self.annotations = self.get_annotations() if pre_load else pd.DataFrame - self.existing_ids = self.get_existing_ids() if pre_load else pd.DataFrame - self.relationships = self.get_relationships() if pre_load else pd.DataFrame - self.synonyms = self.get_synonyms() if pre_load else pd.DataFrame + + self.save_folder = pathing(backups_folder) + try: + self.save_folder.mkdir() + except: + pass + + self.terms = pd.DataFrame + self.superclasses = pd.DataFrame + self.annotations = pd.DataFrame + self.existing_ids = pd.DataFrame + self.relationships = pd.DataFrame + self.synonyms = pd.DataFrame + + # Auto load tables from backup + if pre_load: + self.pre_loader() + + + def pre_loader(self): + # self.terms_complete = self.get_terms_complete() if pre_load else pd.DataFrame + self.terms = self.get_terms() + # self.superclasses = self.get_superclasses() + self.existing_ids = self.get_existing_ids() + self.synonyms = self.get_synonyms() + # self.relationships = self.get_relationships() + # self.annotations = self.get_annotations() def fetch_terms_complete(self): if self.terms_complete.empty: @@ -66,29 +117,24 @@ def fetch_superclasses(self): def get_terms(self): ''' GROUP BY is a shortcut to only getting the first in every list of group ''' - if not self.terms.empty: - return self.terms if self.from_backup: - self.terms = open_pickle(TERMS_BACKUP_PATH) + self.terms = open_pickle(self.save_folder / TERMS_BACKUP) return self.terms - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT t.id as tid, t.ilx, t.label, t.definition, t.type, t.comment, t.version, t.uid, t.cid, t.time, t.status FROM terms t GROUP BY t.ilx + HAVING t.status = '0' """ - self.terms = pd.read_sql(data, engine) - create_pickle(self.terms, TERMS_BACKUP_PATH) + self.terms = pd.read_sql(sql_query, self.engine) + create_pickle(self.terms, self.save_folder / TERMS_BACKUP) return self.terms def get_annotations(self): - if not self.annotations: - return self.fetch_annotations() if self.from_backup: - self.annotations = open_pickle(ANNOS_BACKUP_PATH) + self.annotations = open_pickle(self.save_folder / ANNOS_BACKUP) return self.annotations - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT ta.tid, ta.annotation_tid as annotation_type_tid, t1.ilx as term_ilx, t2.ilx as annotation_type_ilx, @@ -99,46 +145,43 @@ def get_annotations(self): SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t1 ON ta.tid=t1.id JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t2 ON ta.annotation_tid=t2.id """ - self.annotations = pd.read_sql(data, engine) - create_pickle(self.annotations, ANNOS_BACKUP_PATH) + self.annotations = pd.read_sql(sql_query, self.engine) + create_pickle(self.annotations, self.save_folder / ANNOS_BACKUP) return self.annotations def get_existing_ids(self): - if not self.existing_ids.empty: - return self.existing_ids if self.from_backup: - self.existing_ids = open_pickle(EXIDS_BACKUP_PATH) + self.existing_ids = open_pickle(self.save_folder / EXIDS_BACKUP) return self.existing_ids - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT tei.tid, tei.curie, tei.iri, tei.preferred, t.ilx, t.label, t.definition, t.status FROM ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) as t JOIN term_existing_ids AS tei ON t.id = tei.tid """ - self.existing_ids = pd.read_sql(data, engine) - create_pickle(self.existing_ids, EXIDS_BACKUP_PATH) + self.existing_ids = pd.read_sql(sql_query, self.engine) + create_pickle(self.existing_ids, self.save_folder / EXIDS_BACKUP) return self.existing_ids def get_relationships(self): - if not self.relationships.empty: - return self.relationships if self.from_backup: - self.relationships = open_pickle(RELAS_BACKUP_PATH) + self.relationships = open_pickle(self.save_folder / RELAS_BACKUP) return self.relationships - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT t1.id as term1_tid, t1.ilx AS term1_ilx, t1.type as term1_type, t1.label as term1_label, t2.id as term2_tid, t2.ilx AS term2_ilx, t2.type as term2_type, t2.label as term2_label, @@ -163,18 +206,15 @@ def get_relationships(self): HAVING terms.status = '0' ) AS t3 ON t3.id = tr.relationship_tid """ - self.relationships = pd.read_sql(data, engine) - create_pickle(self.relationships, RELAS_BACKUP_PATH) + self.relationships = pd.read_sql(sql_query, self.engine) + create_pickle(self.relationships, self.save_folder / RELAS_BACKUP) return self.relationships def get_superclasses(self): - if not self.superclasses.empty: - return self.superclasses if self.from_backup: - self.superclasses = open_pickle(SUPER_BACKUP_PATH) + self.superclasses = open_pickle(self.save_folder / SUPER_BACKUP) return self.superclasses - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT ts.tid, ts.superclass_tid, t1.label as term_label, t1.ilx as term_ilx, @@ -184,46 +224,44 @@ def get_superclasses(self): SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) as t1 ON t1.id = ts.tid JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t2 ON t2.id = ts.superclass_tid """ - self.superclasses = pd.read_sql(data, engine) - create_pickle(self.superclasses, SUPER_BACKUP_PATH) + self.superclasses = pd.read_sql(sql_query, self.engine) + create_pickle(self.superclasses, self.save_folder / SUPER_BACKUP) return self.superclasses def get_synonyms(self): - if not self.synonyms.empty: - return self.synonyms if self.from_backup: - self.synonyms = open_pickle(SYNOS_BACKUP_PATH) + self.synonyms = open_pickle(self.save_folder / SYNOS_BACKUP) return self.synonyms - engine = create_engine(self.db_url) - data = """ + sql_query = """ SELECT ts.tid as tid, t.ilx, ts.literal, ts.type FROM term_synonyms AS ts JOIN ( SELECT * FROM terms GROUP BY terms.ilx + HAVING terms.status = '0' ) AS t WHERE ts.tid=t.id """ - self.synonyms = pd.read_sql(data, engine) - create_pickle(self.synonyms, SYNOS_BACKUP_PATH) + self.synonyms = pd.read_sql(sql_query, self.engine) + create_pickle(self.synonyms, self.save_folder / SYNOS_BACKUP) return self.synonyms def get_terms_complete(self) -> pd.DataFrame: ''' Gets complete entity data like term/view ''' - if not self.terms_complete.empty: - return self.terms_complete if self.from_backup: - self.terms_complete = open_pickle(TERMS_COMPLETE_BACKUP_PATH) + self.terms_complete = open_pickle(self.save_folder / TERMSC_BACKUP) return self.terms_complete ilx2synonyms = self.get_ilx2synonyms() ilx2existing_ids = self.get_ilx2existing_ids() @@ -239,37 +277,22 @@ def get_terms_complete(self) -> pd.DataFrame: row['superclass'] = ilx2superclass.get(row['ilx']) ilx_complete.append(row) terms_complete = pd.DataFrame(ilx_complete) - create_pickle(terms_complete, TERMS_COMPLETE_BACKUP_PATH) + create_pickle(terms_complete, self.save_folder / TERMSC_BACKUP) return terms_complete - def get_label2id(self): - self.terms = self.fetch_terms() - visited = {} - label_to_id = defaultdict(lambda: defaultdict(list)) - for row in self.terms.itertuples(): - label = self.local_degrade(row.label) - if not visited.get((label, row.type, row.ilx)): - if row.type == 'term': - label_to_id[label]['term'].append(int(row.id)) - visited[(label, row.type, row.ilx)] = True - elif row.type == 'cde': - label_to_id[label]['cde'].append(int(row.id)) - visited[(label, row.type, row.ilx)] = True - elif row.type == 'fde': - label_to_id[label]['fde'].append(int(row.id)) - visited[(label, row.type, row.ilx)] = True - return label_to_id - - def get_label2ilxs(self): - self.terms = self.fetch_terms() - visited = {} - label_to_ilx = defaultdict(list) - for row in self.terms.itertuples(): - label = self.local_degrade(row.label) - if not visited.get((label, row.type, row.ilx)): - label_to_ilx[label].append(str(row.ilx)) - visited[(label, row.type, row.ilx)] = True - return label_to_ilx + def get_label2id(self, clean: object = None) -> Dict[str, list]: + if not clean: + clean = lambda string: string.lower().strip() + label2id = defaultdict(list) + [label2id[clean(row.label)].append(row.id) for row in self.fetch_terms().itertuples()] + return label2id + + def get_label2ilx(self, clean: object = None) -> Dict[str, list]: + if not clean: + clean = lambda string: string.lower().strip() + label2ilx = defaultdict(list) + [label2ilx[clean(row.label)].append(row.ilx) for row in self.fetch_terms().itertuples()] + return label2ilx def get_label2rows(self): self.terms_complete = self.fetch_terms_complete() @@ -446,19 +469,19 @@ def get_fragment2rows(self): return fragement2rows def show_tables(self): - data = "SHOW tables;" - return pd.read_sql(data, self.engine) + sql_query = "SHOW tables;" + return pd.read_sql(sql_query, self.engine) def get_table(self, tablename, limit=5): - data = """ + sql_query = """ SELECT * FROM {tablename} LIMIT {limit} """.format(tablename=tablename, limit=limit) - return pd.read_sql(data, self.engine) + return pd.read_sql(sql_query, self.engine) def get_custom(self, data): - return pd.read_sql(data, self.engine) + return pd.read_sql(sql_query, self.engine) def main(): diff --git a/ilxutils/ilxutils/remotes.py b/ilxutils/ilxutils/remotes.py index 711766c2..76d99fd3 100644 --- a/ilxutils/ilxutils/remotes.py +++ b/ilxutils/ilxutils/remotes.py @@ -8,7 +8,7 @@ def remote(server=''): InterLexRemote = oq.plugin.get('InterLex') if server: - server = server if server.endswith('.') else server + '.' + server = server if server.endswith('.') else server + '.' endpoint = f'https://{server}scicrunch.org/api/1/' # diff --git a/ilxutils/ilxutils/scicrunch_session.py b/ilxutils/ilxutils/scicrunch_session.py index b1b0bfa2..459d8d13 100644 --- a/ilxutils/ilxutils/scicrunch_session.py +++ b/ilxutils/ilxutils/scicrunch_session.py @@ -1,16 +1,26 @@ import json -import requests -from typing import Union, Dict, List +from typing import Union, Dict, List, Tuple from urllib.parse import urljoin +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +import nest_asyncio +nest_asyncio.apply() + +from pyontutils.utils import Async, deferred -class ScicrunchSession: + +class SciCrunchSession: """ Boiler plate for SciCrunch server responses. """ def __init__(self, key: str, - host: str = 'scicrunch.org', - auth: tuple = (None, None)) -> None: + host: str = 'scicrunch.org', # MAIN TEST -> test3.scicrunch.org + auth: tuple = ('scicrunch', 'perl22(query)'), + retries: int = 3, + backoff_factor: float = 1.0, + status_forcelist: tuple = (400, 500, 502, 504),) -> None: """ Initialize Session with SciCrunch Server. :param str key: API key for SciCrunch [should work for test hosts]. @@ -30,6 +40,16 @@ def __init__(self, self.session = requests.Session() self.session.auth = auth self.session.headers.update({'Content-type': 'application/json'}) + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, # 400 for no ILX ID generated. + ) + # adapter = HTTPAdapter(max_retries=retry) + # self.session.mount('http://', adapter) + # self.session.mount('https://', adapter) def __session_shortcut(self, endpoint: str, data: dict, session_type: str = 'GET') -> dict: """ Short for both GET and POST. @@ -46,7 +66,13 @@ def _prepare_data(data: dict) -> dict: raise ValueError('request session data must be of type dictionary') return json.dumps(data) + # urljoin bug; .com/ap1/1/ + /test/ != .com/ap1/1/test/ but .com/test/ + # HOWEVER .com/ap1/1/ + test/ == .com/ap1/1/test/ + endpoint = endpoint[1:] if endpoint.startswith('/') else endpoint url = urljoin(self.api, endpoint) + if data: + for key, value in data.items(): + url = url.format(**{key:value}) data = _prepare_data(data) try: # TODO: Could use a Request here to shorten code. @@ -55,20 +81,83 @@ def _prepare_data(data: dict) -> dict: else: response = self.session.post(url, data=data) # crashes if success on the server side is False - if not response.json()['success']: - raise ValueError(response.text + f' -> STATUS CODE: {response.status_code}') + if response.json()['success'] == False: + # Need to retry if server fails to create the ILX ID. + if response.json().get('errormsg') == 'could not generate ILX identifier': + return response.json() + raise ValueError(response.text + f' -> STATUS CODE: {response.status_code} @ URL: {response.url}') response.raise_for_status() # crashes if the server couldn't use it or it never made it. - except requests.exceptions.HTTPError as error: - raise error + except: + raise requests.exceptions.HTTPError(f'{response.text} {response.status_code}') - # {'data':{}, 'success':bool} + # response.json() == {'data':{}, 'success':bool} return response.json()['data'] - def get(self, endpoint: str, data: dict = None) -> dict: + def _get(self, endpoint: str, data: dict = None) -> dict: """ Quick GET for SciCrunch. """ return self.__session_shortcut(endpoint, data, 'GET') - def post(self, endpoint: str , data: dict = None) -> dict: + def _post(self, endpoint: str , data: dict = None) -> dict: """ Quick POST for SciCrunch. """ return self.__session_shortcut(endpoint, data, 'POST') + + def get(self, func, data_list=None) -> List[Tuple[str, dict]]: + if not data_list: + return self._get(endpoint) + # worker + gin = lambda data: self._get(endpoint, data) + # Builds futures dynamically + return Async()(deferred(gin)(data) for data in data_list) + + def post(self, func: object, data_list: list) -> List[Tuple[str, dict]]: + # worker; return server_response first then initial data input + gin = lambda data: (data, func(data)) + + # Builds futures dynamically + responses = Async()(deferred(gin)(data) for data in data_list) + + # BUG: ilx_ids are created on the PHP side and are slow. Duplicates + # are known to be created "func hit at same time" so we need to a new + # session and try again. + number_of_batch_retries = 0 + while number_of_batch_retries < 10: + data_queue = [] + for response in responses: + data, server_response = response + print(server_response) + if server_response.get('errormsg') == 'could not generate ILX identifier': + data_queue.append(data) + if data_queue == []: + break + responses = Async()(deferred(gin)(data) for data in data_queue) + number_of_batch_retries += 1 + return + + def get(self, urls, limit=5): + + async def get_single(url, session, auth): + async with session.get(url) as response: + try: + output = await response.json() + except: + output = await response.text() + ValueError(f'{output} with status code [{response.status}]') + return output + + async def get_all(urls, connector, loop): + tasks = [] + async with ClientSession(connector=connector, loop=loop, + auth=self.auth, raise_for_status=True) as session: + for i, url in enumerate(urls): + task = asyncio.ensure_future(get_single(url, session, self.auth)) + tasks.append(task) + return (await asyncio.gather(*tasks)) + + # rate limiter; should be between 20 and 80; 100 maxed out server + connector = TCPConnector(limit=limit) + loop = asyncio.get_event_loop() # event loop initialize + # tasks to do; data is in json format [{},] + future = asyncio.ensure_future(get_all(urls, connector, loop)) + outputs = loop.run_until_complete(future) # loop until done + return {k: v for keyval in outputs for k, v in keyval.items()} diff --git a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb index db9004a3..5a27e35a 100644 --- a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb +++ b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb @@ -620,6 +620,65 @@ "ixr = remote()\n", "# BE CAREFUL :)" ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '661544',\n", + " 'orig_uid': '34142',\n", + " 'uid': '34142',\n", + " 'orig_cid': '0',\n", + " 'cid': '0',\n", + " 'ilx': 'ilx_0738390',\n", + " 'label': 'Offical label',\n", + " 'type': 'term',\n", + " 'definition': 'official definition',\n", + " 'comment': 'helpful misc',\n", + " 'version': '3',\n", + " 'status': '0',\n", + " 'display_superclass': '1',\n", + " 'orig_time': '1564695195',\n", + " 'time': '1586550353',\n", + " 'synonyms': [{'id': '1845765',\n", + " 'tid': '661544',\n", + " 'literal': 'Encephalon',\n", + " 'type': '',\n", + " 'time': '1586550353',\n", + " 'version': '3'},\n", + " {'id': '1845766',\n", + " 'tid': '661544',\n", + " 'literal': 'Cerebro',\n", + " 'type': '',\n", + " 'time': '1586550353',\n", + " 'version': '3'}],\n", + " 'superclasses': [],\n", + " 'existing_ids': [{'id': '4972084',\n", + " 'tid': '661544',\n", + " 'curie': 'ILX:0738390',\n", + " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", + " 'curie_catalog_id': '3885424',\n", + " 'version': '3',\n", + " 'time': '1586550353',\n", + " 'preferred': '1'}],\n", + " 'relationships': [],\n", + " 'mappings': [],\n", + " 'annotations': [],\n", + " 'ontologies': []}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ixrt.ilx_cli.get_entity('ilx_0738390')" + ] } ], "metadata": { From e75362f0639e7eb67981e3ff24517c3dada781e6 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Wed, 20 May 2020 17:16:28 -0700 Subject: [PATCH 09/16] bug fixes --- ilxutils/ilxutils/interlex_sql.py | 2 +- ilxutils/ilxutils/scicrunch_session.py | 176 ++++++++++++++----------- 2 files changed, 101 insertions(+), 77 deletions(-) diff --git a/ilxutils/ilxutils/interlex_sql.py b/ilxutils/ilxutils/interlex_sql.py index 733c410f..1cf7e509 100755 --- a/ilxutils/ilxutils/interlex_sql.py +++ b/ilxutils/ilxutils/interlex_sql.py @@ -480,7 +480,7 @@ def get_table(self, tablename, limit=5): """.format(tablename=tablename, limit=limit) return pd.read_sql(sql_query, self.engine) - def get_custom(self, data): + def get_custom(self, sql_query): return pd.read_sql(sql_query, self.engine) diff --git a/ilxutils/ilxutils/scicrunch_session.py b/ilxutils/ilxutils/scicrunch_session.py index 459d8d13..a2030575 100644 --- a/ilxutils/ilxutils/scicrunch_session.py +++ b/ilxutils/ilxutils/scicrunch_session.py @@ -5,8 +5,6 @@ import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry -import nest_asyncio -nest_asyncio.apply() from pyontutils.utils import Async, deferred @@ -14,28 +12,50 @@ class SciCrunchSession: """ Boiler plate for SciCrunch server responses. """ + class Error(Exception): + """Script could not complete.""" + + class NoApiKeyError(Error): + """ No api key has been set """ + + class IncorrectAPIKeyError(Error): + """Incorrect API key for scicrunch website used.""" + def __init__(self, key: str, - host: str = 'scicrunch.org', # MAIN TEST -> test3.scicrunch.org - auth: tuple = ('scicrunch', 'perl22(query)'), - retries: int = 3, - backoff_factor: float = 1.0, - status_forcelist: tuple = (400, 500, 502, 504),) -> None: + host: str = 'test3.scicrunch.org', # MAIN TEST -> test3.scicrunch.org + auth: tuple = ('', ''), # user, password for authentication + retries: int = 3, # retries if code in status_forcelist + backoff_factor: float = 1.0, # delay factor for reties + status_forcelist: tuple = (500, 502, 504), # flagged codes for retry + ) -> None: """ Initialize Session with SciCrunch Server. :param str key: API key for SciCrunch [should work for test hosts]. :param str host: Base url for hosting server [can take localhost:8080]. - :param str user: username for test server. - :param str password: password for test server. """ self.key = key - self.host = host + self.host = '' + self.api = '' + + # Pull host for potential url + if host.startswith('http'): + host = urlparse(host).netloc - # https is only for security level environments - if self.host.startswith('localhost'): - self.api = "http://" + self.host + '/api/1/' + # Use host to create api url + if host.startswith('localhost'): + self.host = "http://" + host + self.api = self.host + '/api/1/' else: - self.api = "https://" + self.host + '/api/1/' + self.host = "https://" + host + self.api = self.host + '/api/1/' + + # Api key check + if self.key is None: # injected by orthauth + # Error here because viewing without a key handled in InterLexRemote not here + raise self.NoApiKeyError('You have not set an API key for the SciCrunch API!') + if not requests.get(self.api+'user/info', params={'key':self.key}).status_code in [200, 201]: + raise self.IncorrectAPIKeyError(f'api_key given is incorrect.') self.session = requests.Session() self.session.auth = auth @@ -47,9 +67,9 @@ def __init__(self, backoff_factor=backoff_factor, status_forcelist=status_forcelist, # 400 for no ILX ID generated. ) - # adapter = HTTPAdapter(max_retries=retry) - # self.session.mount('http://', adapter) - # self.session.mount('https://', adapter) + adapter = HTTPAdapter(max_retries=retry) + self.session.mount('http://', adapter) + self.session.mount('https://', adapter) def __session_shortcut(self, endpoint: str, data: dict, session_type: str = 'GET') -> dict: """ Short for both GET and POST. @@ -74,17 +94,15 @@ def _prepare_data(data: dict) -> dict: for key, value in data.items(): url = url.format(**{key:value}) data = _prepare_data(data) + # TODO: Could use a Request here to shorten code. + if session_type == 'GET': + response = self.session.get(url, data=data) + else: + response = self.session.post(url, data=data) try: - # TODO: Could use a Request here to shorten code. - if session_type == 'GET': - response = self.session.get(url, data=data) - else: - response = self.session.post(url, data=data) # crashes if success on the server side is False if response.json()['success'] == False: # Need to retry if server fails to create the ILX ID. - if response.json().get('errormsg') == 'could not generate ILX identifier': - return response.json() raise ValueError(response.text + f' -> STATUS CODE: {response.status_code} @ URL: {response.url}') response.raise_for_status() # crashes if the server couldn't use it or it never made it. @@ -102,62 +120,68 @@ def _post(self, endpoint: str , data: dict = None) -> dict: """ Quick POST for SciCrunch. """ return self.__session_shortcut(endpoint, data, 'POST') - def get(self, func, data_list=None) -> List[Tuple[str, dict]]: - if not data_list: - return self._get(endpoint) + def get(self, endpoint, data_list, tag=None) -> List[Tuple[str, dict]]: # worker - gin = lambda data: self._get(endpoint, data) + gin = lambda endpoint, data: (tag, self._get(endpoint, data)) # Builds futures dynamically - return Async()(deferred(gin)(data) for data in data_list) + return Async()(deferred(gin)(endpoint, data) for endpoint, data in zip(endpoint, data_list)) - def post(self, func: object, data_list: list) -> List[Tuple[str, dict]]: + def post(self, endpoint: object, data_list: list) -> List[Tuple[str, dict]]: # worker; return server_response first then initial data input - gin = lambda data: (data, func(data)) + gin = lambda data: (data, self._post(endpoint, data)) # Builds futures dynamically responses = Async()(deferred(gin)(data) for data in data_list) - - # BUG: ilx_ids are created on the PHP side and are slow. Duplicates - # are known to be created "func hit at same time" so we need to a new - # session and try again. - number_of_batch_retries = 0 - while number_of_batch_retries < 10: - data_queue = [] - for response in responses: - data, server_response = response - print(server_response) - if server_response.get('errormsg') == 'could not generate ILX identifier': - data_queue.append(data) - if data_queue == []: - break - responses = Async()(deferred(gin)(data) for data in data_queue) - number_of_batch_retries += 1 - return - - def get(self, urls, limit=5): - - async def get_single(url, session, auth): - async with session.get(url) as response: - try: - output = await response.json() - except: - output = await response.text() - ValueError(f'{output} with status code [{response.status}]') - return output - - async def get_all(urls, connector, loop): - tasks = [] - async with ClientSession(connector=connector, loop=loop, - auth=self.auth, raise_for_status=True) as session: - for i, url in enumerate(urls): - task = asyncio.ensure_future(get_single(url, session, self.auth)) - tasks.append(task) - return (await asyncio.gather(*tasks)) - - # rate limiter; should be between 20 and 80; 100 maxed out server - connector = TCPConnector(limit=limit) - loop = asyncio.get_event_loop() # event loop initialize - # tasks to do; data is in json format [{},] - future = asyncio.ensure_future(get_all(urls, connector, loop)) - outputs = loop.run_until_complete(future) # loop until done - return {k: v for keyval in outputs for k, v in keyval.items()} + return responses + + # def post(self, func: object, data_list: list) -> List[Tuple[str, dict]]: + # # worker; return server_response first then initial data input + # gin = lambda data: (data, func(data)) + # + # # Builds futures dynamically + # responses = Async()(deferred(gin)(data) for data in data_list) + # + # # BUG: ilx_ids are created on the PHP side and are slow. Duplicates + # # are known to be created "func hit at same time" so we need to a new + # # session and try again. + # number_of_batch_retries = 0 + # while number_of_batch_retries < 10: + # data_queue = [] + # for response in responses: + # data, server_response = response + # print(server_response) + # if server_response.get('errormsg') == 'could not generate ILX identifier': + # data_queue.append(data) + # if data_queue == []: + # break + # responses = Async()(deferred(gin)(data) for data in data_queue) + # number_of_batch_retries += 1 + # return + + # def get(self, urls, limit=5): + # + # async def get_single(url, session, auth): + # async with session.get(url) as response: + # try: + # output = await response.json() + # except: + # output = await response.text() + # ValueError(f'{output} with status code [{response.status}]') + # return output + # + # async def get_all(urls, connector, loop): + # tasks = [] + # async with ClientSession(connector=connector, loop=loop, + # auth=self.auth, raise_for_status=True) as session: + # for i, url in enumerate(urls): + # task = asyncio.ensure_future(get_single(url, session, self.auth)) + # tasks.append(task) + # return (await asyncio.gather(*tasks)) + # + # # rate limiter; should be between 20 and 80; 100 maxed out server + # connector = TCPConnector(limit=limit) + # loop = asyncio.get_event_loop() # event loop initialize + # # tasks to do; data is in json format [{},] + # future = asyncio.ensure_future(get_all(urls, connector, loop)) + # outputs = loop.run_until_complete(future) # loop until done + # return {k: v for keyval in outputs for k, v in keyval.items()} From c0c16924e96d8989bf0d8f156b69ddcb7eb6c49e Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 13:24:59 -0700 Subject: [PATCH 10/16] gradual updates --- ilxutils/ilxutils/backup_ilx.py | 4 +- ilxutils/ilxutils/elasticsearch_wrapper.py | 14 +- ilxutils/ilxutils/interlex_sanity_checks.py | 4 + ilxutils/ilxutils/interlex_sql.py | 51 +++- ilxutils/ilxutils/remotes.py | 2 +- ilxutils/ilxutils/scicrunch_session.py | 2 +- ilxutils/ilxutils/sparql.ipynb | 296 ++++++++++++++++++++ 7 files changed, 354 insertions(+), 19 deletions(-) create mode 100644 ilxutils/ilxutils/interlex_sanity_checks.py create mode 100644 ilxutils/ilxutils/sparql.ipynb diff --git a/ilxutils/ilxutils/backup_ilx.py b/ilxutils/ilxutils/backup_ilx.py index 2a94a962..20006cf1 100644 --- a/ilxutils/ilxutils/backup_ilx.py +++ b/ilxutils/ilxutils/backup_ilx.py @@ -4,10 +4,12 @@ import pickle import os + def create_pickle(data, outfilename): with open(outfilename, 'wb') as outfile: pickle.dump(data, outfile) - + + sql = IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION')) diff --git a/ilxutils/ilxutils/elasticsearch_wrapper.py b/ilxutils/ilxutils/elasticsearch_wrapper.py index ee9e8d76..e9767e0e 100644 --- a/ilxutils/ilxutils/elasticsearch_wrapper.py +++ b/ilxutils/ilxutils/elasticsearch_wrapper.py @@ -36,6 +36,17 @@ def search(self, body: dict, **kwargs) -> dict: """ return self.es.search(index=self.type, body=body, **kwargs) + def scroll(self, body: dict, size: int, **kwargs) -> dict: + body['size'] = 10000 + body['from'] = 0 + hits = [] + print(body) + for step in range(0, size, 10000): + hits += self.es.search(index=self.type, body=body, **kwargs)['hits']['hits'] + body['from'] = step + print(body) + return hits + def all_matches(self, sorting: str, size, start) -> dict: """First or last set of entities. @@ -67,7 +78,8 @@ class InterLexES(ElasticSearchTools): def __init__(self, beta=True): super().__init__( - host = BASHRC('INTERLEX_ELASTIC_URL'), + host = BASHRC('SCICRUNCH_ELASTIC_URL'), + # index = 'interlex_2019oct28', index = 'interlex', type = 'term', user = BASHRC('INTERLEX_ELASTIC_USER'), diff --git a/ilxutils/ilxutils/interlex_sanity_checks.py b/ilxutils/ilxutils/interlex_sanity_checks.py new file mode 100644 index 00000000..d953953b --- /dev/null +++ b/ilxutils/ilxutils/interlex_sanity_checks.py @@ -0,0 +1,4 @@ +from .sql import production_sql + +ilx_sql = production_sql(from_backup=True) +ex = ilx_sql.get_existing_ids() diff --git a/ilxutils/ilxutils/interlex_sql.py b/ilxutils/ilxutils/interlex_sql.py index 1cf7e509..d1badb67 100755 --- a/ilxutils/ilxutils/interlex_sql.py +++ b/ilxutils/ilxutils/interlex_sql.py @@ -42,7 +42,7 @@ def pathing(path, check_path=False): return path -class IlxSql(): +class IlxSql: def __init__(self, db_url: str, @@ -266,6 +266,7 @@ def get_terms_complete(self) -> pd.DataFrame: ilx2synonyms = self.get_ilx2synonyms() ilx2existing_ids = self.get_ilx2existing_ids() ilx2annotations = self.get_ilx2annotations() + ilx2relationships = self.get_ilx2relationships() ilx2superclass = self.get_ilx2superclass() ilx_complete = [] header = ['Index'] + list(self.fetch_terms().columns) @@ -292,20 +293,34 @@ def get_label2ilx(self, clean: object = None) -> Dict[str, list]: clean = lambda string: string.lower().strip() label2ilx = defaultdict(list) [label2ilx[clean(row.label)].append(row.ilx) for row in self.fetch_terms().itertuples()] + [label2ilx[clean(row.literal)].append(row.ilx) for row in self.fetch_synonyms().itertuples()] return label2ilx - def get_label2rows(self): - self.terms_complete = self.fetch_terms_complete() - visited = {} - label2rows = defaultdict(list) - header = ['Index'] + list(self.terms_complete.columns) - for row in self.terms_complete.itertuples(): - row = {header[i]:val for i, val in enumerate(row)} - label = self.local_degrade(row['label']) - if not visited.get((label, row['type'], row['ilx'])): - label2rows[label].append(row) - visited[(label, row['type'], row['ilx'])] = True - return label2rows + # def get_label2rows(self): + # self.terms = self.fetch_terms() + # visited = {} + # label2rows = defaultdict(list) + # header = ['Index'] + list(self.terms.columns) + # for row in self.terms.itertuples(): + # row = {header[i]: val for i, val in enumerate(row)} + # label = row['label'].lower().strip() + # if not visited.get((label, row['type'], row['ilx'])): + # label2rows[label].append(row) + # visited[(label, row['type'], row['ilx'])] = True + # return label2rows + + def get_label2rows(self, clean: object = None) -> Dict[str, list]: + if not clean: + clean = lambda string: string.lower().strip() + label2ilx = defaultdict(list) + [label2ilx[clean(row.label)].append(row) for row in self.fetch_terms().itertuples()] + [label2ilx[clean(row.literal)].append(row) for row in self.fetch_synonyms().itertuples()] + return label2ilx + + def get_ilx2synonyms(self) -> defaultdict(list): + ilx2synonyms = defaultdict(list) + [ilx2synonyms[row.ilx].append(row.literal) for row in self.fetch_synonyms().itertuples()] + return ilx2synonyms def get_definition2rows(self): self.terms = self.fetch_terms() @@ -313,8 +328,8 @@ def get_definition2rows(self): definition2rows = defaultdict(list) header = ['Index'] + list(self.terms.columns) for row in self.terms.itertuples(): - row = {header[i]:val for i, val in enumerate(row)} - definition = self.local_degrade(row['definition']) + row = {header[i]: val for i, val in enumerate(row)} + definition = row['definition'].lower().strip() if not definition or definition == ' ': continue if not visited.get((definition, row['type'], row['ilx'])): @@ -372,6 +387,12 @@ def get_tid2annotations(self, clean:bool=True): tid2annotations[row['tid']].append(row) return tid2annotations + def get_ilx2relationships(self): + ilx2relationships = defaultdict(list) + header = ['Index'] + list(self.fetch_relationships().columns) + for row in self.fetch_relationships().itertuples(): + row = {header[i]:val for i, val in enumerate(row)} + def get_ilx2annotations(self, clean:bool=True): ''' clean: for list of literals only ''' ilx2annotations = defaultdict(list) diff --git a/ilxutils/ilxutils/remotes.py b/ilxutils/ilxutils/remotes.py index 76d99fd3..979ae58f 100644 --- a/ilxutils/ilxutils/remotes.py +++ b/ilxutils/ilxutils/remotes.py @@ -15,7 +15,7 @@ def remote(server=''): interlex_remote = InterLexRemote() # setup inheritance classes - interlex_remote.setup(instrumented=OntTerm) interlex_remote.apiEndpoint = endpoint + interlex_remote.setup(instrumented=OntTerm) return interlex_remote diff --git a/ilxutils/ilxutils/scicrunch_session.py b/ilxutils/ilxutils/scicrunch_session.py index a2030575..1c68a422 100644 --- a/ilxutils/ilxutils/scicrunch_session.py +++ b/ilxutils/ilxutils/scicrunch_session.py @@ -37,7 +37,7 @@ def __init__(self, self.key = key self.host = '' self.api = '' - + # Pull host for potential url if host.startswith('http'): host = urlparse(host).netloc diff --git a/ilxutils/ilxutils/sparql.ipynb b/ilxutils/ilxutils/sparql.ipynb new file mode 100644 index 00000000..fb11bab8 --- /dev/null +++ b/ilxutils/ilxutils/sparql.ipynb @@ -0,0 +1,296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from pyontutils.core import OntResIri\n", + "g = OntResIri('https://cassava.ucsd.edu/sparc/exports/curation-export.ttl').graph" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "q2=\"\"\"\n", + "SELECT ?subj ?pred ?obj\n", + "WHERE {\n", + " TEMP:hasDerivedInformationAsParticipant ?obj .\n", + "?subj TEMP:hasDerivedInformationAsParticipant ?obj .\n", + "}\n", + "\"\"\"\n", + "\n", + "templates = SparqlQueryTemplates(g.namespace_manager)\n", + "q = templates.dataset_group(\n", + " subject='https://api.blackfynn.io/datasets/N:dataset:bc4071fd-aba1-4fe5-a59e-3da5affbc5fb/subjects/10653',\n", + ")\n", + "# print(q)\n", + "ts = []\n", + "sp = g.query(q2)" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from typing import Union, Dict, List, Tuple\n", + "import unittest\n", + "\n", + "import rdflib\n", + "from rdflib.plugins import sparql\n", + "import pytest\n", + "\n", + "from pyontutils.core import OntResIri\n", + "\n", + "Semantic = Union[rdflib.URIRef, rdflib.Literal, rdflib.BNode]\n", + "\n", + "\n", + "class TestCurationExportTtl:\n", + "\n", + " def __init__(self):\n", + " self.ori = OntResIri('https://cassava.ucsd.edu/sparc/exports/curation-export.ttl')\n", + " self.graph = self.ori.graph\n", + " self.spaql_templates = SparqlQueryTemplates(self.graph)\n", + "\n", + " def test_dataset_group(self):\n", + " \"\"\" sparql queries here \"\"\"\n", + " subj = rdflib.URIRef('https://api.blackfynn.io/datasets/N:dataset:c2a014b8-2c15-4269-b10a-3345420e3d56/subjects/53')\n", + " query = self.spaql_templates.dataset_group()\n", + " assert len(list(self.graph.query(query, initBindings={'target': subj}))) > 0\n", + "\n", + " def test_related_datasets(self):\n", + " subj = rdflib.util.from_n3('dataset:bec4d335-9377-4863-9017-ecd01170f354', nsm=self.graph)\n", + " query = self.spaql_templates.related_datasets()\n", + " assert len(list(self.graph.query(query, initBindings={'target': subj}))) > 0\n", + "\n", + "\n", + "class SparqlQueryTemplates:\n", + " \"\"\" Creates SPARQL query templates. \"\"\"\n", + "\n", + " def __init__(self, nsm=None):\n", + " self.nsm = nsm if nsm else rdflib.Graph().namespace_manager\n", + " self.prefixes = {p:ns for p, ns in self.nsm.namespaces() if p}\n", + "\n", + " def sparql_iri(self, iri: Union[rdflib.URIRef, str]) -> str:\n", + " \"\"\" Converts IRIs and curies to a usable format for SPARQL queries. \"\"\"\n", + " if iri.startswith('http') or isinstance(iri, rdflib.URIRef):\n", + " return '<'+str(iri)+'>'\n", + " return iri\n", + "\n", + " def dataset_group(self) -> str:\n", + " \"\"\" Get all subject groups and dataset associated with subject input.\n", + "\n", + " :returns: list of tuples containing: subject, subjects group, and subjects dataset.\n", + " \"\"\"\n", + " query = \"\"\"\n", + " SELECT ?subj ?group ?dataset\n", + " WHERE {\n", + " ?target TEMP:hasAssignedGroup ?group .\n", + " ?subj TEMP:hasAssignedGroup ?group .\n", + " ?subj TEMP:hasDerivedInformationAsParticipant ?dataset .\n", + " }\n", + " \"\"\"\n", + " return sparql.prepareQuery(query, initNs=self.prefixes)\n", + "\n", + " def related_datasets(self) -> str:\n", + " \"\"\" Get all related datasets of subject.\n", + "\n", + " :returns: list of tuples containing: subject & subjects shared dataset.\n", + " \"\"\"\n", + " query = \"\"\"\n", + " SELECT ?subj ?dataset\n", + " WHERE {\n", + " ?target TEMP:collectionTitle ?dataset .\n", + " ?subj TEMP:collectionTitle ?dataset .\n", + " }\n", + " \"\"\"\n", + " return sparql.prepareQuery(query, initNs=self.prefixes)\n", + " \n", + "TestCurationExportTtl().test_dataset_group()" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 205, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.test_dataset_group()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rdflib import RDFS\n", + "from_n3('')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'rdflib.plugins' has no attribute 'sparql'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrdflib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplugins\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msparql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepareQuery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitN\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: module 'rdflib.plugins' has no attribute 'sparql'" + ] + } + ], + "source": [ + "rdflib.plugins.sparql.prepareQuery(query, initN)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from rdflib.plugins import sparql" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rdflib==5.0.0.dev0\n" + ] + } + ], + "source": [ + "!pip3 freeze | grep rdflib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (, line 1)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m from rdflib.plugins\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "from rdflib.plugins.sp" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'rdflib' has no attribute 'plugins'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrdflib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mrdflib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplugins\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msparql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepareQuery\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: module 'rdflib' has no attribute 'plugins'" + ] + } + ], + "source": [ + "import rdflib\n", + "rdflib.plugins.sparql.prepareQuery" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from pyontutils.core import OntResIri\n", + "from rdflib.plugins.sparql import prepareQuery" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 62cf03ef6ba3e50a0edfff94effc41ae814505ec Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 16:30:54 -0700 Subject: [PATCH 11/16] hardcoded to merge specific ontologies --- pyontutils/ontload.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyontutils/ontload.py b/pyontutils/ontload.py index b987ab1e..24decc66 100755 --- a/pyontutils/ontload.py +++ b/pyontutils/ontload.py @@ -579,13 +579,15 @@ def inner(local_filepath, remote=False): def loadall(git_local, repo_name, local=False, dobig=False): local_base = jpth(git_local, repo_name) lb_ttl = os.path.realpath(jpth(local_base, 'ttl')) - + #match = (rdflib.term.URIRef('http://purl.org/dc/elements/1.1/member'), # iao.owl #rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), #rdflib.term.URIRef('http://www.w3.org/2002/07/owl#AnnotationProperty')) done = [] - filenames = [f for g in ('*', '*/*', '*/*/*') for f in glob(lb_ttl + '/' + g + '.ttl')] + git_path = '/home/tmsincomb/Dropbox/git/' + hardcoded_files = [repo_name + '/' + 'extra.ttl'] + [git_path+'fma_slim.ttl', git_path+'emapa.ttl', git_path+'uberon.ttl', git_path+'mondo.ttl'] + filenames = hardcoded_files + [f for g in ('*', '*/*', '*/*/*') for f in glob(lb_ttl + '/' + g + '.ttl')] graph = OntGraph() for f in filenames: print(f) @@ -598,12 +600,17 @@ def repeat(dobig=dobig): # we don't really know when to stop, so just adjust for s, o in graph.subject_objects(owl.imports): if os.path.basename(o) not in done and o not in done: #if (o, rdf.type, owl.Ontology) not in graph: + if not o.startswith('/home/tmsincomb'): + continue print(o) done.append(o) ext = os.path.splitext(o)[1] fmt = 'turtle' if ext == '.ttl' else 'xml' if noneMembers(o, *bigleaves) or dobig: - graph.parse(o, format=fmt) + try: + graph.parse(o, format=fmt) + except: + print('FAILED:', o) #if match in graph: #raise BaseException('Evil file found %s' % o) @@ -670,7 +677,7 @@ def inner(graph): records = {c:[l, s, p] for c, l, s, p in inner(graph) if l or s} with open(os.path.expanduser('~/files/ontology-classes-with-labels-synonyms-parents.json'), 'wt') as f: - json.dump(records, f, sort_keys=True, indent=2) + json.dump(records, f, sort_keys=True, indent=2) def make_post_clone(git_local, repo_name, remote_base): From 84b1093a2722f2a654f7b992ff47cb99f379be2c Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 16:34:41 -0700 Subject: [PATCH 12/16] sync to tgbugs --- .../Tutorials/interlex_remotes_tutorial.ipynb | 742 ++++++++++++++++++ ilxutils/ilx-playground.ipynb | 16 +- ilxutils/ilxutils/backup_ilx.py | 10 +- ilxutils/ilxutils/interlex_sql.py | 295 +++---- ilxutils/ilxutils/nltklib.py | 3 +- ilxutils/ilxutils/ontopandas.py | 6 - ilxutils/ilxutils/remotes.py | 29 +- .../tutorials/interlex_remotes_tutorial.ipynb | 295 ++++--- 8 files changed, 1058 insertions(+), 338 deletions(-) create mode 100644 ilxutils/Tutorials/interlex_remotes_tutorial.ipynb diff --git a/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb new file mode 100644 index 00000000..f596fda2 --- /dev/null +++ b/ilxutils/Tutorials/interlex_remotes_tutorial.ipynb @@ -0,0 +1,742 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# INSTALL\n", + "- WARNING ::: ONLY DO ONCE \n", + " - update devconfig in ~/.config/pyontutils/devonfig.yaml\n", + " - scigraph_api: http://scigraph.scicrunch.io:9000/scigraph\n", + " - Install both pyontutils and ilxutils with pyontutils\n", + " - cd ~/git/pyontutils\n", + " - pip3 install --user --editable .\n", + " - cd ~/git/pyontutils/ilxutils/\n", + " - pip3 install --user --editable .\n", + " - Clone ontquery and install\n", + " - cd ~/git\n", + " - git clone https://github.com/tgbugs/ontquery.git\n", + " - cd ~/git/ontquery\n", + " - pip3 install --user --editable ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Maintainance\n", + "- update repos\n", + " - cd ~/git/pyontutils\n", + " - git pull \n", + " - cd ~/git/ontquery\n", + " - git pull" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Google Sheets Import\n", + "### Need pyontutils secrets.yaml setup first!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
termcurie
1abdominal cavityUBERON:0003684
2abdominal wallUBERON:0003697
3adipose tissueUBERON:0001013
4adult organismUBERON:0007023
5alimentary part of gastrointestinal systemUBERON:0005409
\n", + "
" + ], + "text/plain": [ + "0 term curie\n", + "1 abdominal cavity UBERON:0003684\n", + "2 abdominal wall UBERON:0003697\n", + "3 adipose tissue UBERON:0001013\n", + "4 adult organism UBERON:0007023\n", + "5 alimentary part of gastrointestinal system UBERON:0005409" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyontutils.sheets import Sheet\n", + "import pandas as pd\n", + "KEY_NAME = 'sparc-terms'\n", + "SHEET_NAME = 'Minimal information model(MIS)'\n", + "\n", + "class Brainstem(Sheet):\n", + " name = KEY_NAME # key name you gave the google sheet id value in secrets.yaml\n", + " sheet_name = SHEET_NAME # the actual sheet name on the google sheet\n", + " fetch_grid = True # meta data in self.grid that has detials like bolding\n", + "\n", + "brainstem = Brainstem()\n", + "df = pd.DataFrame(brainstem.raw_values)\n", + "df.columns = df.iloc[0]\n", + "df.drop(df.index[0], inplace=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['abdominal cavity',\n", + " 'abdominal wall',\n", + " 'adipose tissue',\n", + " 'adult organism',\n", + " 'alimentary part of gastrointestinal system',\n", + " 'arterial blood',\n", + " 'biceps femoris',\n", + " 'blood',\n", + " 'bolus of food',\n", + " 'brainstem']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(df.term)[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CSV or TSV EXAMPLE" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "import pandas as pd\n", + "\n", + "csv_df = pd.DataFrame('/path/to/csv')\n", + "tsv_df = pd.DataFrame('/path/to/tsv', delimiter='\\t')\n", + "\n", + "csv_df.head() # returns top 5 rows\n", + "csv_df.column_name # specific column name will return a Series which will act like a list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# QUERY DATABASES " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[OntTerm('HBA:3999', label='brain (hba)'),\n", + " OntTerm('FMA:50801', label='Brain'),\n", + " OntTerm('UBERON:0000955', label='brain'),\n", + " OntTerm('UBERON:6110636', label='adult cerebral ganglion'),\n", + " OntTerm('ILX:0101431', label='Brain'),\n", + " OntTerm('ILX:0101433', label='Brain Infarction'),\n", + " OntTerm('ILX:0506386', label='Brain Aneurysm'),\n", + " OntTerm('ILX:0433050', label='Brain Chemistry'),\n", + " OntTerm('ILX:0641746', label='alpha BRAIN'),\n", + " OntTerm('ILX:0726394', label='brain meninx'),\n", + " OntTerm('ILX:0729002', label='brain commissure'),\n", + " OntTerm('ILX:0101434', label='Brain Ischemia'),\n", + " OntTerm('ILX:0461406', label='Brain Death'),\n", + " OntTerm('ILX:0733041', label='brain endothelium')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Give \"query\" a usable parameter to query the databases \n", + "from pyontutils.core import query # OntTerm\n", + "query(term='brain')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[OntTerm('ILX:0103358', label='DN1 neuron'),\n", + " OntTerm('ILX:0109525', label='Pupal DN1 period neuron')]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# similar entities will show\n", + "# default limit is 10\n", + "query(term='DN1 neuron', limit=2) " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[OntTerm('UBERON:0000955', label='brain')]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Faster and more accurate with curie/iri\n", + "query(curie='UBERON:0000955')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'prefix': 'UBERON',\n", + " 'suffix': '0000955',\n", + " 'orig_kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", + " 'curie_or_iri': None,\n", + " 'label': None,\n", + " 'term': None,\n", + " 'search': None,\n", + " 'validated': None,\n", + " 'query': None},\n", + " 'kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", + " 'curie_or_iri': None,\n", + " 'label': None,\n", + " 'term': None,\n", + " 'search': None,\n", + " 'validated': None,\n", + " 'query': None},\n", + " 'label': 'brain',\n", + " 'labels': ['brain'],\n", + " 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].',\n", + " 'synonyms': ['the brain',\n", + " 'synganglion',\n", + " 'suprasegmental structures',\n", + " 'suprasegmental levels of nervous system',\n", + " 'encephalon'],\n", + " 'deprecated': False,\n", + " 'predicates': {},\n", + " '_type': OntId('owl:Class'),\n", + " '_types': (OntId('owl:Class'),),\n", + " '_graph': None,\n", + " '_source': ,\n", + " 'validated': True,\n", + " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entity = query(curie='UBERON:0000955')[0]\n", + "# Full result attribute\n", + "vars(entity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DEBUGGING HINT\n", + "- 1 \"?\" at the end of a function or class will return its params, docstring, and pathing. \n", + "- 2 \"??\" returns the ENTIRE class/functions " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m \u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mCall signature:\u001b[0m\n", + "\u001b[0mquery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mterm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mprefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcategory\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mabbrev\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msearch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msuffix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcurie\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0miri\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mexclude_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdepth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdirection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'OUTGOING'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minclude_deprecated\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mType:\u001b[0m OntQueryCli\n", + "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/query.py\n", + "\u001b[0;31mDocstring:\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BONUS!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Concurrently search! (Run multiple query functions at the same time)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Futures compiled\n" + ] + }, + { + "data": { + "text/plain": [ + "[({'curie': 'UBERON:0000955'}, [OntTerm('UBERON:0000955', label='brain')]),\n", + " ({'curie': 'UBERON:6110636'},\n", + " [OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pyontutils.utils import Async, deferred\n", + "from pyontutils.core import OntTerm, ixr, query\n", + "from typing import List, Tuple\n", + "\n", + "# query.setup()\n", + "\n", + "def queries(kwargs_list:List[dict]) -> List[Tuple[str, dict]]:\n", + " '''Asynchronously query databases to dramatically increase runtime un users end \n", + " \n", + " Examples:\n", + " >>> queries([{'term':'Brain'},])\n", + " [({'term': 'Brain'},\n", + " [OntTerm('HBA:3999', label='brain (hba)'),\n", + " OntTerm('FMA:50801', label='Brain'),\n", + " OntTerm('UBERON:0000955', label='brain'),\n", + " OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]\n", + " >>> queries([{'curie':'UBERON:0000955'},])\n", + " [({'curie': 'UBERON:0000955'}, [OntTerm('UBERON:0000955', label='brain')])]\n", + " \n", + " Definitions:\n", + " kwargs == common name given to dictionary input for function\n", + " tuple == a list that you cannot update. \n", + " lambda == short-hand for single line function creation (func = lambda : ) \n", + " \n", + " Args:\n", + " kwargs_list (list): A list of dictionaries that are paramaters for the query function\n", + " \n", + " Returns:\n", + " List[tuple]: A list of tuples all being of (, ). \n", + " '''\n", + " # create a query function wrapper to return tuple\n", + " # kwargs -> (kwargs, query_result)\n", + " # We do this in case 2+ queries return the same results & the output WILL NOT have the same input order\n", + " gin = lambda kwargs: (kwargs, query(**kwargs))\n", + " # run each query instance at the same time\n", + " results = Async(use_nest_asyncio=True)(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", + " return results \n", + "\n", + "queries([{'curie':'UBERON:0000955'}, {'curie':'UBERON:6110636'}])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyontutils.utils import Async, deferred\n", + "from pyontutils.core import OntTerm, ixr, query\n", + "from typing import List, Tuple\n", + "def queries(url_list:List[dict]) -> List[Tuple[str, dict]]:\n", + " def gin(url):\n", + " return requests.get(url).text\n", + " # run each query instance at the same time\n", + " results = Async(limit=5)(deferred(gin)(url) for url in url_list)\n", + " return results \n", + "list_tuples(url, html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Modifing TEST InterLex" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# TEST InterLex endpoints\n", + "from ilxutils.remotes import interlex_remote_test as ixrt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GET ENTITY" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '661544',\n", + " 'orig_uid': '34142',\n", + " 'uid': '34142',\n", + " 'orig_cid': '0',\n", + " 'cid': '0',\n", + " 'ilx': 'ilx_0738390',\n", + " 'label': 'Offical label',\n", + " 'type': 'term',\n", + " 'definition': 'official definition',\n", + " 'comment': 'helpful misc',\n", + " 'version': '3',\n", + " 'status': '0',\n", + " 'display_superclass': '1',\n", + " 'orig_time': '1564695195',\n", + " 'time': '1570826848',\n", + " 'synonyms': [{'id': '1776645',\n", + " 'tid': '661544',\n", + " 'literal': 'Encephalon',\n", + " 'type': '',\n", + " 'time': '1570826848',\n", + " 'version': '3'},\n", + " {'id': '1776646',\n", + " 'tid': '661544',\n", + " 'literal': 'Cerebro',\n", + " 'type': '',\n", + " 'time': '1570826848',\n", + " 'version': '3'}],\n", + " 'superclasses': [],\n", + " 'existing_ids': [{'id': '3885545',\n", + " 'tid': '661544',\n", + " 'curie': 'ILX:0738390',\n", + " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", + " 'curie_catalog_id': '3885424',\n", + " 'version': '3',\n", + " 'time': '1570826848',\n", + " 'preferred': '1'}],\n", + " 'relationships': [],\n", + " 'mappings': [],\n", + " 'annotations': [],\n", + " 'ontologies': []}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ixrt.ilx_cli.get_entity('tmp_0738390')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ADD" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0mixrt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_entity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msubThingOf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdefinition\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", + "\u001b[0;31mType:\u001b[0m method\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ixrt.add_entity?" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + ] + } + ], + "source": [ + "entity = dict(\n", + " label = 'offical label', # Can only one unique label per person\n", + " type = 'term', # OPTIONS: term, annotation, relationship, cde, fde, pde\n", + " definition = 'official definition',\n", + " comment = 'helpful misc',\n", + " # Optional\n", + " subThingOf = '', # WARNING ::: must have at last '', can be blank but please fill this in if you can. \n", + " synonyms = ['Encephalon', 'Cerebro'],\n", + " predicates = {} # annotations and/or relationships to add\n", + " # TODO: existing_ids will be an option later\n", + ")\n", + "result = ixrt.add_entity(**entity)\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UPDATE" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0mixrt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate_entity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0milx_id\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msubThingOf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdefinition\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_add\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_delete\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", + "\u001b[0;31mType:\u001b[0m method\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ixrt.update_entity?" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[2019-10-11 13:47:28,619]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:796 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro']}\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + ] + } + ], + "source": [ + "from ilxutils.remotes import interlex_remote_test as ixrt\n", + "entity = dict(\n", + " ilx_id = 'ILX:0738390',\n", + " label = 'Offical label', # Can only one unique label per person\n", + " type = 'term', # OPTIONS: term, annotation, relationship, cde, fde, pde\n", + " definition = 'official definition',\n", + " comment = 'helpful misc',\n", + " # Optional\n", + " subThingOf = '', # WARNING ::: must have at last '', can be blank but please fill this in if you can. \n", + " synonyms = ['Encephalon', 'Cerebro'],\n", + " predicates_to_add = {}, # annotations and/or relationships to add\n", + " predicates_to_delete = {}, # annotations and/or relationships to del\n", + " # TODO: existing_ids will be an option later\n", + ")\n", + "result = ixrt.update_entity(**entity)\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PRODUCTION \n", + "# BE CAREFUL PLEASE :)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# PRODUCTION \n", + "from ilxutils.remotes import interlex_remote_production as ixr\n", + "# BE CAREFUL :)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ilxutils/ilx-playground.ipynb b/ilxutils/ilx-playground.ipynb index 7072d902..7f8afdfb 100644 --- a/ilxutils/ilx-playground.ipynb +++ b/ilxutils/ilx-playground.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 33, @@ -674,12 +681,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -6475,7 +6477,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/ilxutils/ilxutils/backup_ilx.py b/ilxutils/ilxutils/backup_ilx.py index 20006cf1..a4b102f7 100644 --- a/ilxutils/ilxutils/backup_ilx.py +++ b/ilxutils/ilxutils/backup_ilx.py @@ -1,15 +1,9 @@ from pathlib import Path as p -from interlex_sql import IlxSql -# from tools import create_pickle -import pickle +from ilxutils.interlex_sql import IlxSql +from ilxutils.tools import create_pickle import os -def create_pickle(data, outfilename): - with open(outfilename, 'wb') as outfile: - pickle.dump(data, outfile) - - sql = IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION')) diff --git a/ilxutils/ilxutils/interlex_sql.py b/ilxutils/ilxutils/interlex_sql.py index d1badb67..256a0f83 100755 --- a/ilxutils/ilxutils/interlex_sql.py +++ b/ilxutils/ilxutils/interlex_sql.py @@ -1,84 +1,33 @@ -from collections import defaultdict -import os from pathlib import Path -from typing import Union, Dict, Tuple, List - import pandas as pd from sqlalchemy import create_engine, inspect, Table, Column - +from collections import defaultdict from ilxutils.tools import light_degrade, open_pickle, create_pickle - - -TERMSC_BACKUP = 'ilx_db_terms_complete_backup.pickle' -TERMS_BACKUP = 'ilx_db_terms_backup.pickle' -ANNOS_BACKUP = 'ilx_db_annotations_backup.pickle' -RELAS_BACKUP = 'ilx_db_relationships_backup.pickle' -SUPER_BACKUP = 'ilx_db_superclasses_backup.pickle' -SYNOS_BACKUP = 'ilx_db_synonyms_backup.pickle' -EXIDS_BACKUP = 'ilx_db_existing_ids_backup.pickle' - - -def pathing(path, check_path=False): - """ Extract absolute path from shortened or relative paths. - - :param str path: path to file or folder. - - :examples: - >>>pathing('~/shortened.filepath') - >>>pathing('../relative.filepath') - >>>pathing('relative.filepath') - >>>pathing('/home/absoulte/filepath') - """ - path = Path(path) - if str(path).startswith('~'): - path = path.expanduser() - else: - path = path.resolve() - - if check_path: - if not path.is_file() and path.is_dir(): - raise ValueError(f'{path} does not exit') - - return path - - -class IlxSql: - - def __init__(self, - db_url: str, - from_backup: bool = False, - pre_load: bool = False, - backups_folder: str = '~/.interlex_backups'): - - self.engine = create_engine(db_url) +import os +#ELASTIC = 'https://5f86098ac2b28a982cebf64e82db4ea2.us-west-2.aws.found.io:9243/interlex/term/' +TERMS_COMPLETE_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_terms_complete_backup.pickle' +TERMS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_terms_backup.pickle' +ANNOS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_annotations_backup.pickle' +RELAS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_relationships_backup.pickle' +SUPER_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_superclasses_backup.pickle' +SYNOS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_synonyms_backup.pickle' +EXIDS_BACKUP_PATH = Path.home()/'Dropbox/interlex_backups/ilx_db_ex_backup.pickle' + + +class IlxSql(): + + def __init__(self, db_url, pre_load=False, from_backup=False): + self.db_url = db_url + self.engine = create_engine(self.db_url) + self.local_degrade = lambda string: string.lower().strip() # current degrade of choice for sql self.from_backup = from_backup - - self.save_folder = pathing(backups_folder) - try: - self.save_folder.mkdir() - except: - pass - - self.terms = pd.DataFrame - self.superclasses = pd.DataFrame - self.annotations = pd.DataFrame - self.existing_ids = pd.DataFrame - self.relationships = pd.DataFrame - self.synonyms = pd.DataFrame - - # Auto load tables from backup - if pre_load: - self.pre_loader() - - - def pre_loader(self): - # self.terms_complete = self.get_terms_complete() if pre_load else pd.DataFrame - self.terms = self.get_terms() - # self.superclasses = self.get_superclasses() - self.existing_ids = self.get_existing_ids() - self.synonyms = self.get_synonyms() - # self.relationships = self.get_relationships() - # self.annotations = self.get_annotations() + self.terms_complete = self.get_terms_complete() if pre_load else pd.DataFrame + self.terms = self.get_terms() if pre_load else pd.DataFrame + self.superclasses = self.get_superclasses if pre_load else pd.DataFrame + self.annotations = self.get_annotations() if pre_load else pd.DataFrame + self.existing_ids = self.get_existing_ids() if pre_load else pd.DataFrame + self.relationships = self.get_relationships() if pre_load else pd.DataFrame + self.synonyms = self.get_synonyms() if pre_load else pd.DataFrame def fetch_terms_complete(self): if self.terms_complete.empty: @@ -117,24 +66,29 @@ def fetch_superclasses(self): def get_terms(self): ''' GROUP BY is a shortcut to only getting the first in every list of group ''' + if not self.terms.empty: + return self.terms if self.from_backup: - self.terms = open_pickle(self.save_folder / TERMS_BACKUP) + self.terms = open_pickle(TERMS_BACKUP_PATH) return self.terms - sql_query = """ - SELECT t.id as tid, t.ilx, t.label, t.definition, t.type, t.comment, t.version, t.uid, t.cid, t.time, t.status + engine = create_engine(self.db_url) + data = """ + SELECT t.id as tid, t.ilx, t.label, t.definition, t.type, t.comment, t.version, t.uid, t.time FROM terms t GROUP BY t.ilx - HAVING t.status = '0' """ - self.terms = pd.read_sql(sql_query, self.engine) - create_pickle(self.terms, self.save_folder / TERMS_BACKUP) + self.terms = pd.read_sql(data, engine) + create_pickle(self.terms, TERMS_BACKUP_PATH) return self.terms def get_annotations(self): + if not self.annotations: + return self.fetch_annotations() if self.from_backup: - self.annotations = open_pickle(self.save_folder / ANNOS_BACKUP) + self.annotations = open_pickle(ANNOS_BACKUP_PATH) return self.annotations - sql_query = """ + engine = create_engine(self.db_url) + data = """ SELECT ta.tid, ta.annotation_tid as annotation_type_tid, t1.ilx as term_ilx, t2.ilx as annotation_type_ilx, @@ -145,76 +99,79 @@ def get_annotations(self): SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t1 ON ta.tid=t1.id JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t2 ON ta.annotation_tid=t2.id """ - self.annotations = pd.read_sql(sql_query, self.engine) - create_pickle(self.annotations, self.save_folder / ANNOS_BACKUP) + self.annotations = pd.read_sql(data, engine) + create_pickle(self.annotations, ANNOS_BACKUP_PATH) return self.annotations def get_existing_ids(self): + if not self.existing_ids.empty: + return self.existing_ids if self.from_backup: - self.existing_ids = open_pickle(self.save_folder / EXIDS_BACKUP) + self.existing_ids = open_pickle(EXIDS_BACKUP_PATH) return self.existing_ids - sql_query = """ - SELECT tei.tid, tei.curie, tei.iri, tei.preferred, t.ilx, t.label, t.definition, t.status + engine = create_engine(self.db_url) + data = """ + SELECT tei.tid, tei.curie, tei.iri, tei.preferred, t.ilx, t.label, t.definition FROM ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) as t JOIN term_existing_ids AS tei ON t.id = tei.tid """ - self.existing_ids = pd.read_sql(sql_query, self.engine) - create_pickle(self.existing_ids, self.save_folder / EXIDS_BACKUP) + self.existing_ids = pd.read_sql(data, engine) + create_pickle(self.existing_ids, EXIDS_BACKUP_PATH) return self.existing_ids def get_relationships(self): + if not self.relationships.empty: + return self.relationships if self.from_backup: - self.relationships = open_pickle(self.save_folder / RELAS_BACKUP) + self.relationships = open_pickle(RELAS_BACKUP_PATH) return self.relationships - sql_query = """ + engine = create_engine(self.db_url) + data = """ SELECT - t1.id as term1_tid, t1.ilx AS term1_ilx, t1.type as term1_type, t1.label as term1_label, - t2.id as term2_tid, t2.ilx AS term2_ilx, t2.type as term2_type, t2.label as term2_label, + t1.id as term1_tid, t1.ilx AS term1_ilx, t1.type as term1_type, + t2.id as term2_tid, t2.ilx AS term2_ilx, t2.type as term2_type, t3.id as relationship_tid, t3.ilx AS relationship_ilx, t3.label as relationship_label FROM term_relationships AS tr JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) t1 ON t1.id = tr.term1_id JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t2 ON t2.id = tr.term2_id JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t3 ON t3.id = tr.relationship_tid """ - self.relationships = pd.read_sql(sql_query, self.engine) - create_pickle(self.relationships, self.save_folder / RELAS_BACKUP) + self.relationships = pd.read_sql(data, engine) + create_pickle(self.relationships, RELAS_BACKUP_PATH) return self.relationships def get_superclasses(self): + if not self.superclasses.empty: + return self.superclasses if self.from_backup: - self.superclasses = open_pickle(self.save_folder / SUPER_BACKUP) + self.superclasses = open_pickle(SUPER_BACKUP_PATH) return self.superclasses - sql_query = """ + engine = create_engine(self.db_url) + data = """ SELECT ts.tid, ts.superclass_tid, t1.label as term_label, t1.ilx as term_ilx, @@ -224,49 +181,50 @@ def get_superclasses(self): SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) as t1 ON t1.id = ts.tid JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t2 ON t2.id = ts.superclass_tid """ - self.superclasses = pd.read_sql(sql_query, self.engine) - create_pickle(self.superclasses, self.save_folder / SUPER_BACKUP) + self.superclasses = pd.read_sql(data, engine) + create_pickle(self.superclasses, SUPER_BACKUP_PATH) return self.superclasses def get_synonyms(self): + if not self.synonyms.empty: + return self.synonyms if self.from_backup: - self.synonyms = open_pickle(self.save_folder / SYNOS_BACKUP) + self.synonyms = open_pickle(SYNOS_BACKUP_PATH) return self.synonyms - sql_query = """ + engine = create_engine(self.db_url) + data = """ SELECT ts.tid as tid, t.ilx, ts.literal, ts.type FROM term_synonyms AS ts JOIN ( SELECT * FROM terms GROUP BY terms.ilx - HAVING terms.status = '0' ) AS t WHERE ts.tid=t.id """ - self.synonyms = pd.read_sql(sql_query, self.engine) - create_pickle(self.synonyms, self.save_folder / SYNOS_BACKUP) + self.synonyms = pd.read_sql(data, engine) + create_pickle(self.synonyms, SYNOS_BACKUP_PATH) return self.synonyms def get_terms_complete(self) -> pd.DataFrame: ''' Gets complete entity data like term/view ''' + if not self.terms_complete.empty: + return self.terms_complete if self.from_backup: - self.terms_complete = open_pickle(self.save_folder / TERMSC_BACKUP) + self.terms_complete = open_pickle(TERMS_COMPLETE_BACKUP_PATH) return self.terms_complete ilx2synonyms = self.get_ilx2synonyms() ilx2existing_ids = self.get_ilx2existing_ids() ilx2annotations = self.get_ilx2annotations() - ilx2relationships = self.get_ilx2relationships() ilx2superclass = self.get_ilx2superclass() ilx_complete = [] header = ['Index'] + list(self.fetch_terms().columns) @@ -278,49 +236,50 @@ def get_terms_complete(self) -> pd.DataFrame: row['superclass'] = ilx2superclass.get(row['ilx']) ilx_complete.append(row) terms_complete = pd.DataFrame(ilx_complete) - create_pickle(terms_complete, self.save_folder / TERMSC_BACKUP) + create_pickle(terms_complete, TERMS_COMPLETE_BACKUP_PATH) return terms_complete - def get_label2id(self, clean: object = None) -> Dict[str, list]: - if not clean: - clean = lambda string: string.lower().strip() - label2id = defaultdict(list) - [label2id[clean(row.label)].append(row.id) for row in self.fetch_terms().itertuples()] - return label2id - - def get_label2ilx(self, clean: object = None) -> Dict[str, list]: - if not clean: - clean = lambda string: string.lower().strip() - label2ilx = defaultdict(list) - [label2ilx[clean(row.label)].append(row.ilx) for row in self.fetch_terms().itertuples()] - [label2ilx[clean(row.literal)].append(row.ilx) for row in self.fetch_synonyms().itertuples()] - return label2ilx - - # def get_label2rows(self): - # self.terms = self.fetch_terms() - # visited = {} - # label2rows = defaultdict(list) - # header = ['Index'] + list(self.terms.columns) - # for row in self.terms.itertuples(): - # row = {header[i]: val for i, val in enumerate(row)} - # label = row['label'].lower().strip() - # if not visited.get((label, row['type'], row['ilx'])): - # label2rows[label].append(row) - # visited[(label, row['type'], row['ilx'])] = True - # return label2rows - - def get_label2rows(self, clean: object = None) -> Dict[str, list]: - if not clean: - clean = lambda string: string.lower().strip() - label2ilx = defaultdict(list) - [label2ilx[clean(row.label)].append(row) for row in self.fetch_terms().itertuples()] - [label2ilx[clean(row.literal)].append(row) for row in self.fetch_synonyms().itertuples()] - return label2ilx - - def get_ilx2synonyms(self) -> defaultdict(list): - ilx2synonyms = defaultdict(list) - [ilx2synonyms[row.ilx].append(row.literal) for row in self.fetch_synonyms().itertuples()] - return ilx2synonyms + def get_label2id(self): + self.terms = self.fetch_terms() + visited = {} + label_to_id = defaultdict(lambda: defaultdict(list)) + for row in self.terms.itertuples(): + label = self.local_degrade(row.label) + if not visited.get((label, row.type, row.ilx)): + if row.type == 'term': + label_to_id[label]['term'].append(int(row.id)) + visited[(label, row.type, row.ilx)] = True + elif row.type == 'cde': + label_to_id[label]['cde'].append(int(row.id)) + visited[(label, row.type, row.ilx)] = True + elif row.type == 'fde': + label_to_id[label]['fde'].append(int(row.id)) + visited[(label, row.type, row.ilx)] = True + return label_to_id + + def get_label2ilxs(self): + self.terms = self.fetch_terms() + visited = {} + label_to_ilx = defaultdict(list) + for row in self.terms.itertuples(): + label = self.local_degrade(row.label) + if not visited.get((label, row.type, row.ilx)): + label_to_ilx[label].append(str(row.ilx)) + visited[(label, row.type, row.ilx)] = True + return label_to_ilx + + def get_label2rows(self): + self.terms_complete = self.fetch_terms_complete() + visited = {} + label2rows = defaultdict(list) + header = ['Index'] + list(self.terms_complete.columns) + for row in self.terms_complete.itertuples(): + row = {header[i]:val for i, val in enumerate(row)} + label = self.local_degrade(row['label']) + if not visited.get((label, row['type'], row['ilx'])): + label2rows[label].append(row) + visited[(label, row['type'], row['ilx'])] = True + return label2rows def get_definition2rows(self): self.terms = self.fetch_terms() @@ -328,8 +287,8 @@ def get_definition2rows(self): definition2rows = defaultdict(list) header = ['Index'] + list(self.terms.columns) for row in self.terms.itertuples(): - row = {header[i]: val for i, val in enumerate(row)} - definition = row['definition'].lower().strip() + row = {header[i]:val for i, val in enumerate(row)} + definition = self.local_degrade(row['definition']) if not definition or definition == ' ': continue if not visited.get((definition, row['type'], row['ilx'])): @@ -387,12 +346,6 @@ def get_tid2annotations(self, clean:bool=True): tid2annotations[row['tid']].append(row) return tid2annotations - def get_ilx2relationships(self): - ilx2relationships = defaultdict(list) - header = ['Index'] + list(self.fetch_relationships().columns) - for row in self.fetch_relationships().itertuples(): - row = {header[i]:val for i, val in enumerate(row)} - def get_ilx2annotations(self, clean:bool=True): ''' clean: for list of literals only ''' ilx2annotations = defaultdict(list) @@ -490,19 +443,19 @@ def get_fragment2rows(self): return fragement2rows def show_tables(self): - sql_query = "SHOW tables;" - return pd.read_sql(sql_query, self.engine) + data = "SHOW tables;" + return pd.read_sql(data, self.engine) def get_table(self, tablename, limit=5): - sql_query = """ + data = """ SELECT * FROM {tablename} LIMIT {limit} """.format(tablename=tablename, limit=limit) - return pd.read_sql(sql_query, self.engine) + return pd.read_sql(data, self.engine) - def get_custom(self, sql_query): - return pd.read_sql(sql_query, self.engine) + def get_custom(self, data): + return pd.read_sql(data, self.engine) def main(): diff --git a/ilxutils/ilxutils/nltklib.py b/ilxutils/ilxutils/nltklib.py index 27acae74..da5c9918 100644 --- a/ilxutils/ilxutils/nltklib.py +++ b/ilxutils/ilxutils/nltklib.py @@ -137,8 +137,7 @@ def sentence_similarity(sentence1, sentence2, ignore_integers=False): tokens2 = word_tokenize(sentence2) tokens1 = clean_tokens(tokens1, ignore_integers) tokens2 = clean_tokens(tokens2, ignore_integers) - print(tokens1) - print(tokens2) + # tag sentence1 = pos_tag(tokens1) sentence2 = pos_tag(tokens2) diff --git a/ilxutils/ilxutils/ontopandas.py b/ilxutils/ilxutils/ontopandas.py index f777863b..faf2581b 100644 --- a/ilxutils/ilxutils/ontopandas.py +++ b/ilxutils/ilxutils/ontopandas.py @@ -43,12 +43,10 @@ class OntoPandas: def __init__(self, obj: Union[rdflib.graph.Graph, str], query:str=defaultquery, - curie:bool=True, qnamed:bool=False, str_vals:bool=False,) -> None: self.query = query self.qnamed = qnamed - self.curie = curie self.str_vals = str_vals self.g = obj # could be path self.path = obj # could be graph @@ -272,10 +270,6 @@ def get_sparql_dataframe( self ): df = df.where((pd.notnull(df)), None) # default Null is fricken Float NaN df = df.reset_index().rename(columns={'index':'iri'}) - - if self.curie: - df['curie'] = df.apply(lambda row: self.qname(row.iri), axis = 1) - return df diff --git a/ilxutils/ilxutils/remotes.py b/ilxutils/ilxutils/remotes.py index 979ae58f..4137adb1 100644 --- a/ilxutils/ilxutils/remotes.py +++ b/ilxutils/ilxutils/remotes.py @@ -2,20 +2,19 @@ from pyontutils.core import OntTerm import os -def remote(server=''): +TEST = 'https://test3.scicrunch.org/api/1/' +PRODUCTION = 'https://scicrunch.org/api/1/' - # Request interlex remote (scigraph is also an option for plugins) - InterLexRemote = oq.plugin.get('InterLex') +InterLexRemote = oq.plugin.get('InterLex') +interlex_remote_production = InterLexRemote( + # When ready, should be changed to 'https://scicrunch.org/api/1/' for production + apiEndpoint = PRODUCTION +) +interlex_remote_production.setup(instrumented=OntTerm) - if server: - server = server if server.endswith('.') else server + '.' - endpoint = f'https://{server}scicrunch.org/api/1/' - - # - interlex_remote = InterLexRemote() - - # setup inheritance classes - interlex_remote.apiEndpoint = endpoint - interlex_remote.setup(instrumented=OntTerm) - - return interlex_remote +# InterLexRemote = oq.plugin.get('InterLex') +# interlex_remote_test = InterLexRemote( +# # When ready, should be changed to 'https://scicrunch.org/api/1/' for production +# apiEndpoint = TEST +# ) +# interlex_remote_test.setup(instrumented=OntTerm) diff --git a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb index 5a27e35a..f596fda2 100644 --- a/ilxutils/tutorials/interlex_remotes_tutorial.ipynb +++ b/ilxutils/tutorials/interlex_remotes_tutorial.ipynb @@ -42,9 +42,78 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
termcurie
1abdominal cavityUBERON:0003684
2abdominal wallUBERON:0003697
3adipose tissueUBERON:0001013
4adult organismUBERON:0007023
5alimentary part of gastrointestinal systemUBERON:0005409
\n", + "
" + ], + "text/plain": [ + "0 term curie\n", + "1 abdominal cavity UBERON:0003684\n", + "2 abdominal wall UBERON:0003697\n", + "3 adipose tissue UBERON:0001013\n", + "4 adult organism UBERON:0007023\n", + "5 alimentary part of gastrointestinal system UBERON:0005409" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pyontutils.sheets import Sheet\n", "import pandas as pd\n", @@ -121,17 +190,29 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[OntTerm('UBERON:0000955', label='brain'),\n", - " OntTerm('UBERON:6110636', label='adult cerebral ganglion')]" + "[OntTerm('HBA:3999', label='brain (hba)'),\n", + " OntTerm('FMA:50801', label='Brain'),\n", + " OntTerm('UBERON:0000955', label='brain'),\n", + " OntTerm('UBERON:6110636', label='adult cerebral ganglion'),\n", + " OntTerm('ILX:0101431', label='Brain'),\n", + " OntTerm('ILX:0101433', label='Brain Infarction'),\n", + " OntTerm('ILX:0506386', label='Brain Aneurysm'),\n", + " OntTerm('ILX:0433050', label='Brain Chemistry'),\n", + " OntTerm('ILX:0641746', label='alpha BRAIN'),\n", + " OntTerm('ILX:0726394', label='brain meninx'),\n", + " OntTerm('ILX:0729002', label='brain commissure'),\n", + " OntTerm('ILX:0101434', label='Brain Ischemia'),\n", + " OntTerm('ILX:0461406', label='Brain Death'),\n", + " OntTerm('ILX:0733041', label='brain endothelium')]" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -144,16 +225,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "[OntTerm('ILX:0103358', label='DN1 neuron'),\n", + " OntTerm('ILX:0109525', label='Pupal DN1 period neuron')]" ] }, - "execution_count": 11, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -161,12 +243,12 @@ "source": [ "# similar entities will show\n", "# default limit is 10\n", - "query(term='brain', limit=10, prefix=('ILX')) " + "query(term='DN1 neuron', limit=2) " ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -175,7 +257,7 @@ "[OntTerm('UBERON:0000955', label='brain')]" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -187,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -195,6 +277,20 @@ "text/plain": [ "{'prefix': 'UBERON',\n", " 'suffix': '0000955',\n", + " 'orig_kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", + " 'curie_or_iri': None,\n", + " 'label': None,\n", + " 'term': None,\n", + " 'search': None,\n", + " 'validated': None,\n", + " 'query': None},\n", + " 'kwargs': {'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955',\n", + " 'curie_or_iri': None,\n", + " 'label': None,\n", + " 'term': None,\n", + " 'search': None,\n", + " 'validated': None,\n", + " 'query': None},\n", " 'label': 'brain',\n", " 'labels': ['brain'],\n", " 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].',\n", @@ -208,12 +304,12 @@ " '_type': OntId('owl:Class'),\n", " '_types': (OntId('owl:Class'),),\n", " '_graph': None,\n", - " '_source': ,\n", + " '_source': ,\n", " 'validated': True,\n", - " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" + " '_query_result': QueryResult({'iri': 'http://purl.obolibrary.org/obo/UBERON_0000955', 'curie': 'UBERON:0000955', 'label': 'brain', 'labels': ['brain'], 'definition': 'The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].', 'synonyms': ['the brain', 'synganglion', 'suprasegmental structures', 'suprasegmental levels of nervous system', 'encephalon'], 'deprecated': False, 'predicates': {}, 'type': OntId('owl:Class'), 'types': (OntId('owl:Class'),), '_graph': None, 'source': })}" ] }, - "execution_count": 13, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -235,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -259,12 +355,9 @@ "\u001b[0;34m\u001b[0m \u001b[0mdirection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'OUTGOING'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0minclude_deprecated\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0minclude_supers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0minclude_all_services\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mraw\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mType:\u001b[0m OntQueryCli\n", - "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mString form:\u001b[0m \n", "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/query.py\n", "\u001b[0;31mDocstring:\u001b[0m \n" ] @@ -293,7 +386,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -311,7 +404,7 @@ " [OntTerm('UBERON:6110636', label='adult cerebral ganglion')])]" ] }, - "execution_count": 16, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -352,12 +445,30 @@ " # We do this in case 2+ queries return the same results & the output WILL NOT have the same input order\n", " gin = lambda kwargs: (kwargs, query(**kwargs))\n", " # run each query instance at the same time\n", - " results = Async()(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", + " results = Async(use_nest_asyncio=True)(deferred(gin)(kwargs) for kwargs in kwargs_list)\n", " return results \n", "\n", "queries([{'curie':'UBERON:0000955'}, {'curie':'UBERON:6110636'}])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyontutils.utils import Async, deferred\n", + "from pyontutils.core import OntTerm, ixr, query\n", + "from typing import List, Tuple\n", + "def queries(url_list:List[dict]) -> List[Tuple[str, dict]]:\n", + " def gin(url):\n", + " return requests.get(url).text\n", + " # run each query instance at the same time\n", + " results = Async(limit=5)(deferred(gin)(url) for url in url_list)\n", + " return results \n", + "list_tuples(url, html)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -367,13 +478,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# TEST InterLex endpoints\n", - "from ilxutils.remotes import remote\n", - "ixrt = remote(server='test3')" + "from ilxutils.remotes import interlex_remote_test as ixrt" ] }, { @@ -385,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -397,35 +507,35 @@ " 'orig_cid': '0',\n", " 'cid': '0',\n", " 'ilx': 'ilx_0738390',\n", - " 'label': 'official test label',\n", + " 'label': 'Offical label',\n", " 'type': 'term',\n", - " 'definition': 'definition',\n", + " 'definition': 'official definition',\n", " 'comment': 'helpful misc',\n", - " 'version': '2',\n", + " 'version': '3',\n", " 'status': '0',\n", " 'display_superclass': '1',\n", " 'orig_time': '1564695195',\n", - " 'time': '1564695333',\n", - " 'synonyms': [{'id': '1776589',\n", + " 'time': '1570826848',\n", + " 'synonyms': [{'id': '1776645',\n", " 'tid': '661544',\n", " 'literal': 'Encephalon',\n", " 'type': '',\n", - " 'time': '1564695333',\n", - " 'version': '2'},\n", - " {'id': '1776590',\n", + " 'time': '1570826848',\n", + " 'version': '3'},\n", + " {'id': '1776646',\n", " 'tid': '661544',\n", " 'literal': 'Cerebro',\n", " 'type': '',\n", - " 'time': '1564695333',\n", - " 'version': '2'}],\n", + " 'time': '1570826848',\n", + " 'version': '3'}],\n", " 'superclasses': [],\n", - " 'existing_ids': [{'id': '3885425',\n", + " 'existing_ids': [{'id': '3885545',\n", " 'tid': '661544',\n", " 'curie': 'ILX:0738390',\n", " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", " 'curie_catalog_id': '3885424',\n", - " 'version': '2',\n", - " 'time': '1564695334',\n", + " 'version': '3',\n", + " 'time': '1570826848',\n", " 'preferred': '1'}],\n", " 'relationships': [],\n", " 'mappings': [],\n", @@ -433,7 +543,7 @@ " 'ontologies': []}" ] }, - "execution_count": 21, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -451,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -466,11 +576,9 @@ "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mexisting_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mDocstring:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services/interlex.py\n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, @@ -484,14 +592,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'official test label', 'labels': (), 'definition': 'definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" ] } ], @@ -520,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -536,20 +644,10 @@ "\u001b[0;34m\u001b[0m \u001b[0msynonyms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mcomment\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_add\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0madd_existing_ids\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdelete_existing_ids\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m \u001b[0mpredicates_to_delete\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mcid\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Update existing entity.\n", - "\n", - ":param List[dict] add_existing_ids: iris and curies to be added to entity.\n", - ":param List[dict] delete_existing_ids: iris and curies to be deleted from entity.\n", - "\n", - ">>>update_entity(add_existing_ids=[{'ilx_id':'ilx_1234567', 'iri':'http://abc.org/abc_123', 'curie':'ABC:123'}])\n", - ">>>update_entity(delete_existing_ids=[{'ilx_id':'ilx_1234567', 'iri':'http://abc.org/abc_123', 'curie':'ABC:123'}])\n", - "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services/interlex.py\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m ~/Dropbox/git/ontquery/ontquery/plugins/services.py\n", "\u001b[0;31mType:\u001b[0m method\n" ] }, @@ -563,27 +661,26 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[2020-04-10 13:25:53,802]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:962 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro'], 'add_existing_ids': None, 'delete_existing_ids': None, 'status': '0', 'cid': None}\u001b[0m\n" + "\u001b[32m[2019-10-11 13:47:28,619]\u001b[0m - \u001b[32m INFO\u001b[0m - ontquery - \u001b[34minterlex_client.py:796 \u001b[0m - {'ilx_id': 'ILX:0738390', 'label': 'Offical label', 'type': 'term', 'definition': 'official definition', 'comment': 'helpful misc', 'superclass': '', 'synonyms': ['Encephalon', 'Cerebro']}\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" + "QueryResult({'iri': 'http://uri.interlex.org/base/ilx_0738390', 'curie': 'ILX:0738390', 'label': 'Offical label', 'labels': (), 'definition': 'official definition', 'synonyms': ('Encephalon', 'Cerebro'), 'deprecated': None, 'predicates': {'comment': 'helpful misc'}, 'type': None, 'types': (), '_graph': None, 'source': })\n" ] } ], "source": [ - "from ilxutils.remotes import remote\n", - "ixrt = remote(server='test3')\n", + "from ilxutils.remotes import interlex_remote_test as ixrt\n", "entity = dict(\n", " ilx_id = 'ILX:0738390',\n", " label = 'Offical label', # Can only one unique label per person\n", @@ -611,74 +708,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# PRODUCTION \n", - "from ilxutils.remotes import remote \n", - "ixr = remote()\n", + "from ilxutils.remotes import interlex_remote_production as ixr\n", "# BE CAREFUL :)" ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '661544',\n", - " 'orig_uid': '34142',\n", - " 'uid': '34142',\n", - " 'orig_cid': '0',\n", - " 'cid': '0',\n", - " 'ilx': 'ilx_0738390',\n", - " 'label': 'Offical label',\n", - " 'type': 'term',\n", - " 'definition': 'official definition',\n", - " 'comment': 'helpful misc',\n", - " 'version': '3',\n", - " 'status': '0',\n", - " 'display_superclass': '1',\n", - " 'orig_time': '1564695195',\n", - " 'time': '1586550353',\n", - " 'synonyms': [{'id': '1845765',\n", - " 'tid': '661544',\n", - " 'literal': 'Encephalon',\n", - " 'type': '',\n", - " 'time': '1586550353',\n", - " 'version': '3'},\n", - " {'id': '1845766',\n", - " 'tid': '661544',\n", - " 'literal': 'Cerebro',\n", - " 'type': '',\n", - " 'time': '1586550353',\n", - " 'version': '3'}],\n", - " 'superclasses': [],\n", - " 'existing_ids': [{'id': '4972084',\n", - " 'tid': '661544',\n", - " 'curie': 'ILX:0738390',\n", - " 'iri': 'http://uri.interlex.org/base/ilx_0738390',\n", - " 'curie_catalog_id': '3885424',\n", - " 'version': '3',\n", - " 'time': '1586550353',\n", - " 'preferred': '1'}],\n", - " 'relationships': [],\n", - " 'mappings': [],\n", - " 'annotations': [],\n", - " 'ontologies': []}" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ixrt.ilx_cli.get_entity('ilx_0738390')" - ] } ], "metadata": { @@ -697,7 +734,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.3" } }, "nbformat": 4, From 107430cd294b5074141a0b5279e4a2767f39a903 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 16:36:06 -0700 Subject: [PATCH 13/16] - --- pyontutils/ontutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyontutils/ontutils.py b/pyontutils/ontutils.py index 1013593a..e1eacde6 100755 --- a/pyontutils/ontutils.py +++ b/pyontutils/ontutils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3.7 #!/usr/bin/env pypy3 -from pyontutils.config import auth, devconfig +from pyontutils.config import auth __doc__ = f"""Common commands for ontology processes. Also old ontology refactors to run in the root ttl folder. From 4ac60825ca907aa91796a2b1e7056a636826c810 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 16:37:41 -0700 Subject: [PATCH 14/16] - --- ilxutils/ilxutils/elastic_search.py | 159 ----------- ilxutils/ilxutils/interlex_sanity_checks.py | 4 - ilxutils/ilxutils/scicrunch_session.py | 187 ------------- ilxutils/ilxutils/sparql.ipynb | 296 -------------------- ilxutils/ilxutils/sql.py | 16 -- 5 files changed, 662 deletions(-) delete mode 100644 ilxutils/ilxutils/elastic_search.py delete mode 100644 ilxutils/ilxutils/interlex_sanity_checks.py delete mode 100644 ilxutils/ilxutils/scicrunch_session.py delete mode 100644 ilxutils/ilxutils/sparql.ipynb delete mode 100644 ilxutils/ilxutils/sql.py diff --git a/ilxutils/ilxutils/elastic_search.py b/ilxutils/ilxutils/elastic_search.py deleted file mode 100644 index a600e916..00000000 --- a/ilxutils/ilxutils/elastic_search.py +++ /dev/null @@ -1,159 +0,0 @@ -""" To quick test Elastic to make sure it's behaving normally - -Usage: - elastic_search.py [-h | --help] - elastic_search.py [-v | --version] - elastic_search.py [--username=] [--password=] - -Options: - -h, --help Display this help message - -v, --version Current version of file - -u, --username= Username for Elastic [default: keys/elastic_username.txt] - -p, --password= Password for Elastic [default: keys/elastic_password.txt] -""" -from docopt import docopt -import pandas as pd -from pathlib import Path as p -import requests as r -from sys import exit -import asyncio -from aiohttp import ClientSession, TCPConnector, BasicAuth -import math as m -from IPython import embed -from ilxutils.tools import * -from ilxutils.args_reader import doc2args -VERSION = '0.0.2' - -# TODO: will have to asyncio it later -# TODO: basic auth search a list of ids to get it ready for concurrent searchs; crawl str or list < 10 elements - - -class ElasticSearch: - def __init__(self, user, password): - self.username, self.password = self.clean_auth(user, password) - self.base_url = 'https://5f86098ac2b28a982cebf64e82db4ea2.us-west-2.aws.found.io:9243/interlex' - - def clean_auth(self, *args): - return tuple(map(open_txt, filter(is_file, map(fix_path, args)))) - - def get(self, url, data=None, auth=None): - req = r.get(url, data=data, auth=auth) - req.raise_for_status() - try: - output = req.json() - return output['_source'] - except: - if req.text: - exit(req.text) - else: - exit(url + ', returns None') - - def get_async(self, urls, LIMIT, _print, debug, action): - - async def get_single(url, session, auth): - url_suffix = url.rsplit('/', 1)[-1] - async with session.get(url) as response: - if response.status not in [200, 201]: - try: - output = await response.json() - except: - output = await response.text() - problem = str(output) - if debug: - return {'status_error': url_suffix} - # returns 'found':False... real data doesnt have 'found'; see the problem? - return None - # exit( - # str(problem) + ' with status code [' + - # str(response.status) + ']') - - output = await response.json(content_type=None) - if not output: - if debug: - return {'None': url_suffix, 'failed': url_suffix} - exit(url + ' returns NoneType object') - elif output.get('errormsg'): - if debug: - return {'errormsg': url_suffix, 'failed': url_suffix} - exit(url + ' returns errormsg: ' + str(output['errormsg'])) - elif not output.get('_source'): - if debug: - return {'unknown': url_suffix, 'failed': url_suffix} - exit(url + ' unknown error ' + str(output)) - else: - if debug: - return {'success': url_suffix} - if _print: - print(output['_source']) - return output['_source'] - - async def get_all(urls, connector, loop): - if _print: - print('=== {0} ==='.format(action)) - tasks = [] - auth = BasicAuth(self.username, self.password) - async with ClientSession( - connector=connector, loop=loop, auth=auth) as session: - for i, url in enumerate(urls): - task = asyncio.ensure_future(get_single(url, session, auth)) - tasks.append(task) - return (await asyncio.gather(*tasks)) - - connector = TCPConnector( - limit=LIMIT - ) # rate limiter; should be between 20 and 80; 100 maxed out server - loop = asyncio.get_event_loop() # event loop initialize - future = asyncio.ensure_future( - get_all(urls, connector, loop)) # tasks to do; data is in json format [{},] - outputs = loop.run_until_complete(future) # loop until done - return outputs - # return {k: v for keyval in outputs for k, v in keyval.items()} - - def search_by_ilx_id(self, ilx_id): - ilx_id = str(ilx_id) - ilx_id = ilx_id if 'ilx_' in ilx_id else ('ilx_' + ilx_id) - url = '/'.join([self.base_url, 'term', ilx_id]) - return self.get(url, auth=(self.username, self.password)) - - def search_by_ilx_ids(self, ilx_ids, LIMIT=25, _print=True, debug=False): - urls = [] - for ilx_id in ilx_ids: - ilx_id = str(ilx_id) - ilx_id = ilx_id if 'ilx_' in ilx_id else ('ilx_' + ilx_id) - urls.append('/'.join([self.base_url, 'term', ilx_id])) - action = 'Searching Elastic via ILX IDs' - return self.get_async(urls, LIMIT=LIMIT, _print=_print, debug=debug, action=action) - - -def batch(data, seg_length, func, **kwargs): - total_data = [data[x:x + seg_length] - for x in range(0, len(data), seg_length)] - total_count = m.floor(len(data) / seg_length) - output = [] - for i, data in enumerate(total_data[:1], 0): - print('Batch', i, 'out of', total_count) - output.extend(func(data, **kwargs)) - print(output) - return output - - -def main(): - doc = docopt(__doc__, version=VERSION) - args = doc2args(doc) - es = ElasticSearch(user='keys/.elastic_username', - password='keys/.elastic_password',) - hit = es.search_by_ilx_id('ilx_0101431') - embed() - #hit = es.search_by_ilx_id(ilx_id='ilx_0101431') - #hit = es.search_by_ilx_ids(ilx_ids=['ilx_010143'], _print=False, debug=True) - #terms = open_pickle(p.home() / 'Dropbox/interlex_backups/ilx_db_terms_backup') - #ilx_ids = list(terms.ilx) - #records = batch(ilx_ids[:1000], 100, - # es.search_by_ilx_ids, _print=False, debug=True) - #df = pd.DataFrame.from_records(records) - #print(list(df)) - #create_json(list(df.failed), p.home() / 'Dropbox/failed_elastic') - - -if __name__ == '__main__': - main() diff --git a/ilxutils/ilxutils/interlex_sanity_checks.py b/ilxutils/ilxutils/interlex_sanity_checks.py deleted file mode 100644 index d953953b..00000000 --- a/ilxutils/ilxutils/interlex_sanity_checks.py +++ /dev/null @@ -1,4 +0,0 @@ -from .sql import production_sql - -ilx_sql = production_sql(from_backup=True) -ex = ilx_sql.get_existing_ids() diff --git a/ilxutils/ilxutils/scicrunch_session.py b/ilxutils/ilxutils/scicrunch_session.py deleted file mode 100644 index 1c68a422..00000000 --- a/ilxutils/ilxutils/scicrunch_session.py +++ /dev/null @@ -1,187 +0,0 @@ -import json -from typing import Union, Dict, List, Tuple -from urllib.parse import urljoin - -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry - -from pyontutils.utils import Async, deferred - - -class SciCrunchSession: - """ Boiler plate for SciCrunch server responses. """ - - class Error(Exception): - """Script could not complete.""" - - class NoApiKeyError(Error): - """ No api key has been set """ - - class IncorrectAPIKeyError(Error): - """Incorrect API key for scicrunch website used.""" - - def __init__(self, - key: str, - host: str = 'test3.scicrunch.org', # MAIN TEST -> test3.scicrunch.org - auth: tuple = ('', ''), # user, password for authentication - retries: int = 3, # retries if code in status_forcelist - backoff_factor: float = 1.0, # delay factor for reties - status_forcelist: tuple = (500, 502, 504), # flagged codes for retry - ) -> None: - """ Initialize Session with SciCrunch Server. - - :param str key: API key for SciCrunch [should work for test hosts]. - :param str host: Base url for hosting server [can take localhost:8080]. - """ - self.key = key - self.host = '' - self.api = '' - - # Pull host for potential url - if host.startswith('http'): - host = urlparse(host).netloc - - # Use host to create api url - if host.startswith('localhost'): - self.host = "http://" + host - self.api = self.host + '/api/1/' - else: - self.host = "https://" + host - self.api = self.host + '/api/1/' - - # Api key check - if self.key is None: # injected by orthauth - # Error here because viewing without a key handled in InterLexRemote not here - raise self.NoApiKeyError('You have not set an API key for the SciCrunch API!') - if not requests.get(self.api+'user/info', params={'key':self.key}).status_code in [200, 201]: - raise self.IncorrectAPIKeyError(f'api_key given is incorrect.') - - self.session = requests.Session() - self.session.auth = auth - self.session.headers.update({'Content-type': 'application/json'}) - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, # 400 for no ILX ID generated. - ) - adapter = HTTPAdapter(max_retries=retry) - self.session.mount('http://', adapter) - self.session.mount('https://', adapter) - - def __session_shortcut(self, endpoint: str, data: dict, session_type: str = 'GET') -> dict: - """ Short for both GET and POST. - - Will only crash if success is False or if there a 400+ error. - """ - def _prepare_data(data: dict) -> dict: - """ Check if request data inputed has key and proper format. """ - if data is None: - data = {'key': self.key} - elif isinstance(data, dict): - data.update({'key': self.key}) - else: - raise ValueError('request session data must be of type dictionary') - return json.dumps(data) - - # urljoin bug; .com/ap1/1/ + /test/ != .com/ap1/1/test/ but .com/test/ - # HOWEVER .com/ap1/1/ + test/ == .com/ap1/1/test/ - endpoint = endpoint[1:] if endpoint.startswith('/') else endpoint - url = urljoin(self.api, endpoint) - if data: - for key, value in data.items(): - url = url.format(**{key:value}) - data = _prepare_data(data) - # TODO: Could use a Request here to shorten code. - if session_type == 'GET': - response = self.session.get(url, data=data) - else: - response = self.session.post(url, data=data) - try: - # crashes if success on the server side is False - if response.json()['success'] == False: - # Need to retry if server fails to create the ILX ID. - raise ValueError(response.text + f' -> STATUS CODE: {response.status_code} @ URL: {response.url}') - response.raise_for_status() - # crashes if the server couldn't use it or it never made it. - except: - raise requests.exceptions.HTTPError(f'{response.text} {response.status_code}') - - # response.json() == {'data':{}, 'success':bool} - return response.json()['data'] - - def _get(self, endpoint: str, data: dict = None) -> dict: - """ Quick GET for SciCrunch. """ - return self.__session_shortcut(endpoint, data, 'GET') - - def _post(self, endpoint: str , data: dict = None) -> dict: - """ Quick POST for SciCrunch. """ - return self.__session_shortcut(endpoint, data, 'POST') - - def get(self, endpoint, data_list, tag=None) -> List[Tuple[str, dict]]: - # worker - gin = lambda endpoint, data: (tag, self._get(endpoint, data)) - # Builds futures dynamically - return Async()(deferred(gin)(endpoint, data) for endpoint, data in zip(endpoint, data_list)) - - def post(self, endpoint: object, data_list: list) -> List[Tuple[str, dict]]: - # worker; return server_response first then initial data input - gin = lambda data: (data, self._post(endpoint, data)) - - # Builds futures dynamically - responses = Async()(deferred(gin)(data) for data in data_list) - return responses - - # def post(self, func: object, data_list: list) -> List[Tuple[str, dict]]: - # # worker; return server_response first then initial data input - # gin = lambda data: (data, func(data)) - # - # # Builds futures dynamically - # responses = Async()(deferred(gin)(data) for data in data_list) - # - # # BUG: ilx_ids are created on the PHP side and are slow. Duplicates - # # are known to be created "func hit at same time" so we need to a new - # # session and try again. - # number_of_batch_retries = 0 - # while number_of_batch_retries < 10: - # data_queue = [] - # for response in responses: - # data, server_response = response - # print(server_response) - # if server_response.get('errormsg') == 'could not generate ILX identifier': - # data_queue.append(data) - # if data_queue == []: - # break - # responses = Async()(deferred(gin)(data) for data in data_queue) - # number_of_batch_retries += 1 - # return - - # def get(self, urls, limit=5): - # - # async def get_single(url, session, auth): - # async with session.get(url) as response: - # try: - # output = await response.json() - # except: - # output = await response.text() - # ValueError(f'{output} with status code [{response.status}]') - # return output - # - # async def get_all(urls, connector, loop): - # tasks = [] - # async with ClientSession(connector=connector, loop=loop, - # auth=self.auth, raise_for_status=True) as session: - # for i, url in enumerate(urls): - # task = asyncio.ensure_future(get_single(url, session, self.auth)) - # tasks.append(task) - # return (await asyncio.gather(*tasks)) - # - # # rate limiter; should be between 20 and 80; 100 maxed out server - # connector = TCPConnector(limit=limit) - # loop = asyncio.get_event_loop() # event loop initialize - # # tasks to do; data is in json format [{},] - # future = asyncio.ensure_future(get_all(urls, connector, loop)) - # outputs = loop.run_until_complete(future) # loop until done - # return {k: v for keyval in outputs for k, v in keyval.items()} diff --git a/ilxutils/ilxutils/sparql.ipynb b/ilxutils/ilxutils/sparql.ipynb deleted file mode 100644 index fb11bab8..00000000 --- a/ilxutils/ilxutils/sparql.ipynb +++ /dev/null @@ -1,296 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from pyontutils.core import OntResIri\n", - "g = OntResIri('https://cassava.ucsd.edu/sparc/exports/curation-export.ttl').graph" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "q2=\"\"\"\n", - "SELECT ?subj ?pred ?obj\n", - "WHERE {\n", - " TEMP:hasDerivedInformationAsParticipant ?obj .\n", - "?subj TEMP:hasDerivedInformationAsParticipant ?obj .\n", - "}\n", - "\"\"\"\n", - "\n", - "templates = SparqlQueryTemplates(g.namespace_manager)\n", - "q = templates.dataset_group(\n", - " subject='https://api.blackfynn.io/datasets/N:dataset:bc4071fd-aba1-4fe5-a59e-3da5affbc5fb/subjects/10653',\n", - ")\n", - "# print(q)\n", - "ts = []\n", - "sp = g.query(q2)" - ] - }, - { - "cell_type": "code", - "execution_count": 211, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 211, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from typing import Union, Dict, List, Tuple\n", - "import unittest\n", - "\n", - "import rdflib\n", - "from rdflib.plugins import sparql\n", - "import pytest\n", - "\n", - "from pyontutils.core import OntResIri\n", - "\n", - "Semantic = Union[rdflib.URIRef, rdflib.Literal, rdflib.BNode]\n", - "\n", - "\n", - "class TestCurationExportTtl:\n", - "\n", - " def __init__(self):\n", - " self.ori = OntResIri('https://cassava.ucsd.edu/sparc/exports/curation-export.ttl')\n", - " self.graph = self.ori.graph\n", - " self.spaql_templates = SparqlQueryTemplates(self.graph)\n", - "\n", - " def test_dataset_group(self):\n", - " \"\"\" sparql queries here \"\"\"\n", - " subj = rdflib.URIRef('https://api.blackfynn.io/datasets/N:dataset:c2a014b8-2c15-4269-b10a-3345420e3d56/subjects/53')\n", - " query = self.spaql_templates.dataset_group()\n", - " assert len(list(self.graph.query(query, initBindings={'target': subj}))) > 0\n", - "\n", - " def test_related_datasets(self):\n", - " subj = rdflib.util.from_n3('dataset:bec4d335-9377-4863-9017-ecd01170f354', nsm=self.graph)\n", - " query = self.spaql_templates.related_datasets()\n", - " assert len(list(self.graph.query(query, initBindings={'target': subj}))) > 0\n", - "\n", - "\n", - "class SparqlQueryTemplates:\n", - " \"\"\" Creates SPARQL query templates. \"\"\"\n", - "\n", - " def __init__(self, nsm=None):\n", - " self.nsm = nsm if nsm else rdflib.Graph().namespace_manager\n", - " self.prefixes = {p:ns for p, ns in self.nsm.namespaces() if p}\n", - "\n", - " def sparql_iri(self, iri: Union[rdflib.URIRef, str]) -> str:\n", - " \"\"\" Converts IRIs and curies to a usable format for SPARQL queries. \"\"\"\n", - " if iri.startswith('http') or isinstance(iri, rdflib.URIRef):\n", - " return '<'+str(iri)+'>'\n", - " return iri\n", - "\n", - " def dataset_group(self) -> str:\n", - " \"\"\" Get all subject groups and dataset associated with subject input.\n", - "\n", - " :returns: list of tuples containing: subject, subjects group, and subjects dataset.\n", - " \"\"\"\n", - " query = \"\"\"\n", - " SELECT ?subj ?group ?dataset\n", - " WHERE {\n", - " ?target TEMP:hasAssignedGroup ?group .\n", - " ?subj TEMP:hasAssignedGroup ?group .\n", - " ?subj TEMP:hasDerivedInformationAsParticipant ?dataset .\n", - " }\n", - " \"\"\"\n", - " return sparql.prepareQuery(query, initNs=self.prefixes)\n", - "\n", - " def related_datasets(self) -> str:\n", - " \"\"\" Get all related datasets of subject.\n", - "\n", - " :returns: list of tuples containing: subject & subjects shared dataset.\n", - " \"\"\"\n", - " query = \"\"\"\n", - " SELECT ?subj ?dataset\n", - " WHERE {\n", - " ?target TEMP:collectionTitle ?dataset .\n", - " ?subj TEMP:collectionTitle ?dataset .\n", - " }\n", - " \"\"\"\n", - " return sparql.prepareQuery(query, initNs=self.prefixes)\n", - " \n", - "TestCurationExportTtl().test_dataset_group()" - ] - }, - { - "cell_type": "code", - "execution_count": 205, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 205, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t.test_dataset_group()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from rdflib import RDFS\n", - "from_n3('')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "module 'rdflib.plugins' has no attribute 'sparql'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrdflib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplugins\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msparql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepareQuery\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitN\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: module 'rdflib.plugins' has no attribute 'sparql'" - ] - } - ], - "source": [ - "rdflib.plugins.sparql.prepareQuery(query, initN)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from rdflib.plugins import sparql" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rdflib==5.0.0.dev0\n" - ] - } - ], - "source": [ - "!pip3 freeze | grep rdflib" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "ename": "SyntaxError", - "evalue": "invalid syntax (, line 1)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m from rdflib.plugins\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], - "source": [ - "from rdflib.plugins.sp" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "module 'rdflib' has no attribute 'plugins'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrdflib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mrdflib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplugins\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msparql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepareQuery\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: module 'rdflib' has no attribute 'plugins'" - ] - } - ], - "source": [ - "import rdflib\n", - "rdflib.plugins.sparql.prepareQuery" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from pyontutils.core import OntResIri\n", - "from rdflib.plugins.sparql import prepareQuery" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/ilxutils/ilxutils/sql.py b/ilxutils/ilxutils/sql.py deleted file mode 100644 index b3470e74..00000000 --- a/ilxutils/ilxutils/sql.py +++ /dev/null @@ -1,16 +0,0 @@ -from .interlex_sql import IlxSql -import os - -def production_sql(from_backup=True): - return IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION'), from_backup=from_backup) - -def beta_sql(from_backup=True): - # TEST{#} should be a thing since this still relies on main sql test - return IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_BETA'), from_backup=from_backup) - -# entities = [] -# for ilx, group in ex.groupby('ilx'): -# if not any(list(group['preferred'] == '1')): -# entities.append(ilx) - -# from ontquery.plugins.services.interlex_client import InterLexClient From 6fae27aac3df7b3072c062691cfddb4c9bbf2647 Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Thu, 1 Oct 2020 16:39:06 -0700 Subject: [PATCH 15/16] - --- ilxutils/ilxutils/elastic_search.py | 159 +++++++++++++++++++++ ilxutils/ilxutils/elasticsearch_wrapper.py | 119 --------------- 2 files changed, 159 insertions(+), 119 deletions(-) create mode 100644 ilxutils/ilxutils/elastic_search.py delete mode 100644 ilxutils/ilxutils/elasticsearch_wrapper.py diff --git a/ilxutils/ilxutils/elastic_search.py b/ilxutils/ilxutils/elastic_search.py new file mode 100644 index 00000000..a600e916 --- /dev/null +++ b/ilxutils/ilxutils/elastic_search.py @@ -0,0 +1,159 @@ +""" To quick test Elastic to make sure it's behaving normally + +Usage: + elastic_search.py [-h | --help] + elastic_search.py [-v | --version] + elastic_search.py [--username=] [--password=] + +Options: + -h, --help Display this help message + -v, --version Current version of file + -u, --username= Username for Elastic [default: keys/elastic_username.txt] + -p, --password= Password for Elastic [default: keys/elastic_password.txt] +""" +from docopt import docopt +import pandas as pd +from pathlib import Path as p +import requests as r +from sys import exit +import asyncio +from aiohttp import ClientSession, TCPConnector, BasicAuth +import math as m +from IPython import embed +from ilxutils.tools import * +from ilxutils.args_reader import doc2args +VERSION = '0.0.2' + +# TODO: will have to asyncio it later +# TODO: basic auth search a list of ids to get it ready for concurrent searchs; crawl str or list < 10 elements + + +class ElasticSearch: + def __init__(self, user, password): + self.username, self.password = self.clean_auth(user, password) + self.base_url = 'https://5f86098ac2b28a982cebf64e82db4ea2.us-west-2.aws.found.io:9243/interlex' + + def clean_auth(self, *args): + return tuple(map(open_txt, filter(is_file, map(fix_path, args)))) + + def get(self, url, data=None, auth=None): + req = r.get(url, data=data, auth=auth) + req.raise_for_status() + try: + output = req.json() + return output['_source'] + except: + if req.text: + exit(req.text) + else: + exit(url + ', returns None') + + def get_async(self, urls, LIMIT, _print, debug, action): + + async def get_single(url, session, auth): + url_suffix = url.rsplit('/', 1)[-1] + async with session.get(url) as response: + if response.status not in [200, 201]: + try: + output = await response.json() + except: + output = await response.text() + problem = str(output) + if debug: + return {'status_error': url_suffix} + # returns 'found':False... real data doesnt have 'found'; see the problem? + return None + # exit( + # str(problem) + ' with status code [' + + # str(response.status) + ']') + + output = await response.json(content_type=None) + if not output: + if debug: + return {'None': url_suffix, 'failed': url_suffix} + exit(url + ' returns NoneType object') + elif output.get('errormsg'): + if debug: + return {'errormsg': url_suffix, 'failed': url_suffix} + exit(url + ' returns errormsg: ' + str(output['errormsg'])) + elif not output.get('_source'): + if debug: + return {'unknown': url_suffix, 'failed': url_suffix} + exit(url + ' unknown error ' + str(output)) + else: + if debug: + return {'success': url_suffix} + if _print: + print(output['_source']) + return output['_source'] + + async def get_all(urls, connector, loop): + if _print: + print('=== {0} ==='.format(action)) + tasks = [] + auth = BasicAuth(self.username, self.password) + async with ClientSession( + connector=connector, loop=loop, auth=auth) as session: + for i, url in enumerate(urls): + task = asyncio.ensure_future(get_single(url, session, auth)) + tasks.append(task) + return (await asyncio.gather(*tasks)) + + connector = TCPConnector( + limit=LIMIT + ) # rate limiter; should be between 20 and 80; 100 maxed out server + loop = asyncio.get_event_loop() # event loop initialize + future = asyncio.ensure_future( + get_all(urls, connector, loop)) # tasks to do; data is in json format [{},] + outputs = loop.run_until_complete(future) # loop until done + return outputs + # return {k: v for keyval in outputs for k, v in keyval.items()} + + def search_by_ilx_id(self, ilx_id): + ilx_id = str(ilx_id) + ilx_id = ilx_id if 'ilx_' in ilx_id else ('ilx_' + ilx_id) + url = '/'.join([self.base_url, 'term', ilx_id]) + return self.get(url, auth=(self.username, self.password)) + + def search_by_ilx_ids(self, ilx_ids, LIMIT=25, _print=True, debug=False): + urls = [] + for ilx_id in ilx_ids: + ilx_id = str(ilx_id) + ilx_id = ilx_id if 'ilx_' in ilx_id else ('ilx_' + ilx_id) + urls.append('/'.join([self.base_url, 'term', ilx_id])) + action = 'Searching Elastic via ILX IDs' + return self.get_async(urls, LIMIT=LIMIT, _print=_print, debug=debug, action=action) + + +def batch(data, seg_length, func, **kwargs): + total_data = [data[x:x + seg_length] + for x in range(0, len(data), seg_length)] + total_count = m.floor(len(data) / seg_length) + output = [] + for i, data in enumerate(total_data[:1], 0): + print('Batch', i, 'out of', total_count) + output.extend(func(data, **kwargs)) + print(output) + return output + + +def main(): + doc = docopt(__doc__, version=VERSION) + args = doc2args(doc) + es = ElasticSearch(user='keys/.elastic_username', + password='keys/.elastic_password',) + hit = es.search_by_ilx_id('ilx_0101431') + embed() + #hit = es.search_by_ilx_id(ilx_id='ilx_0101431') + #hit = es.search_by_ilx_ids(ilx_ids=['ilx_010143'], _print=False, debug=True) + #terms = open_pickle(p.home() / 'Dropbox/interlex_backups/ilx_db_terms_backup') + #ilx_ids = list(terms.ilx) + #records = batch(ilx_ids[:1000], 100, + # es.search_by_ilx_ids, _print=False, debug=True) + #df = pd.DataFrame.from_records(records) + #print(list(df)) + #create_json(list(df.failed), p.home() / 'Dropbox/failed_elastic') + + +if __name__ == '__main__': + main() diff --git a/ilxutils/ilxutils/elasticsearch_wrapper.py b/ilxutils/ilxutils/elasticsearch_wrapper.py deleted file mode 100644 index e9767e0e..00000000 --- a/ilxutils/ilxutils/elasticsearch_wrapper.py +++ /dev/null @@ -1,119 +0,0 @@ -from functools import wraps -import json -import os -import subprocess -import docopt -from elasticsearch import Elasticsearch -BASHRC = lambda s: os.environ.get(s) - - -class ElasticSearchTools: - """ Shortcuts for common elasticsearch querys. """ - - def __init__(self, - host: str, index: str, type: str, - user: str, password: str, - size: int = 10, start: int = 0, - scheme: str = 'https',) -> None: - """ - :param str url: ElasticSearch url endpoint. - :param str index: - """ - self.url = f'{scheme}://{host}/{index}' - self.host, self.index, self.type = host, index, type - self.es = Elasticsearch(self.url, http_auth=(user, password)) - - def search(self, body: dict, **kwargs) -> dict: - """ Elasticsearch '/_search' feature. - - We use a framented index called a type. The type is the last index - while the real index becomes part of the host url. - - :param dict body: query dict. - :return: nested elasticsearch dict where hits are in ['hits']['hits'] - - >>>__search(body={ 'query': { 'match_all': {} } }) - """ - return self.es.search(index=self.type, body=body, **kwargs) - - def scroll(self, body: dict, size: int, **kwargs) -> dict: - body['size'] = 10000 - body['from'] = 0 - hits = [] - print(body) - for step in range(0, size, 10000): - hits += self.es.search(index=self.type, body=body, **kwargs)['hits']['hits'] - body['from'] = step - print(body) - return hits - - def all_matches(self, sorting: str, size, start) -> dict: - """First or last set of entities. - - :param str sorting: asc for head or desc for tail. - :param int size: number of entities you want from head or tails. - :param int start: position of index you want to start from. - :return: elasticsearch _search dict - """ - if sorting.lower().strip() not in ['asc', 'desc']: - raise ValueError('sorting can only be asc or desc.') - body = { - 'query': { 'match_all': {} }, - 'sort': [ { '_id': sorting } ], - 'size': size, - 'from': start, - } - return self.search(body) - - def head(self, size=10, start=0): - """ See __end doc. """ - return self.all_matches(sorting='asc', size=size, start=start) - - def tail(self, size=10, start=0): - """ See __end doc. """ - return self.all_matches(sorting='desc', size=size, start=start) - - -class InterLexES(ElasticSearchTools): - - def __init__(self, beta=True): - super().__init__( - host = BASHRC('SCICRUNCH_ELASTIC_URL'), - # index = 'interlex_2019oct28', - index = 'interlex', - type = 'term', - user = BASHRC('INTERLEX_ELASTIC_USER'), - password = BASHRC('INTERLEX_ELASTIC_PASSWORD'), - ) - self.beta = beta - - def filter_tmp(self): - prefix = 'tmp_' if self.beta else 'ilx_' - return { 'prefix': { 'ilx' : { 'value': prefix } } } - - def all_matches(self, sorting: str, size, start) -> dict: - """First or last set of entities. - - :param str sorting: asc for head or desc for tail. - :param int size: number of entities you want from head or tails. - :param int start: position of index you want to start from. - :return: elasticsearch _search dict - """ - if sorting.lower().strip() not in ['asc', 'desc']: - raise ValueError('sorting can only be asc or desc.') - body = { - 'query': self.filter_tmp(), - 'sort': [ { '_id': sorting } ], - 'size': size, - 'from': start, - } - return self.search(body) - - -def main(): - ilxes = InterLexES(beta=False) - print(ilxes.tail(1)) - - -if __name__ == '__main__': - main() From 15c70bd3d225d4a2181bed7c5d4e42a4c592238c Mon Sep 17 00:00:00 2001 From: Troy Sincomb Date: Wed, 21 Oct 2020 09:13:37 -0700 Subject: [PATCH 16/16] sparc-view.ttl added to load --- pyontutils/ontload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyontutils/ontload.py b/pyontutils/ontload.py index 24decc66..49dcea14 100755 --- a/pyontutils/ontload.py +++ b/pyontutils/ontload.py @@ -586,7 +586,7 @@ def loadall(git_local, repo_name, local=False, dobig=False): done = [] git_path = '/home/tmsincomb/Dropbox/git/' - hardcoded_files = [repo_name + '/' + 'extra.ttl'] + [git_path+'fma_slim.ttl', git_path+'emapa.ttl', git_path+'uberon.ttl', git_path+'mondo.ttl'] + hardcoded_files = [repo_name + '/' + 'extra.ttl'] + [git_path+'sparc-view.ttl', git_path+'fma_slim.ttl', git_path+'emapa.ttl', git_path+'uberon.ttl', git_path+'mondo.ttl'] filenames = hardcoded_files + [f for g in ('*', '*/*', '*/*/*') for f in glob(lb_ttl + '/' + g + '.ttl')] graph = OntGraph() for f in filenames: