Skip to content

Commit

Permalink
HGNC ROBOT template
Browse files Browse the repository at this point in the history
- Delete: the source_code column (whose value was always MONDO:OMIM)
- Bug fix: Exact match gene annotations are no longer added when >1 gene is associated with the same MIM.
  • Loading branch information
joeflack4 committed Jun 18, 2024
1 parent cb615e8 commit 6b3a48d
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions omim2obo/mondo_omim_genes_robot_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
'mondo_id': 'ID',
'hgnc_id': "SC 'has material basis in germline mutation in' some %",
'omim_disease_xref': '>A oboInOwl:source',
'source_code': '>A oboInOwl:source',
'omim_gene': '',
}

Expand All @@ -24,23 +23,24 @@ def mondo_omim_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, st
# Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query.
df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')}, inplace=True)

# Add source_code column
df['source_code'] = 'MONDO:OMIM'

# Remove < and > characters from specified columns
uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene']
for col in uri_cols:
df[col] = remove_angle_brackets(list(df[col]))

# Insert ROBOT subheader
df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df])

# Format col order
df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']]
df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'omim_gene']]

# Sort
df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref'])

# Remove cases where >1 gene is associated with the same OMIM disease xref
# - These indicate non-causal relationships, which we don't care about.
df = df[~df['omim_disease_xref'].duplicated(keep=False)]

# Insert ROBOT subheader
df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df])

df.to_csv(outpath, sep='\t', index=False)
return pd.DataFrame()

Expand Down

0 comments on commit 6b3a48d

Please sign in to comment.