Source code for spatialmeta.preprocess._metabolite_annotation

from molmass import Formula
import pandas as pd
import anndata
from scipy.spatial import cKDTree
from pathlib import Path
MODULE_PATH = Path(__file__).parent
from ..util._classes import AnnDataSM

def _calculate_ppm_range(
    observed_mz, 
    ppm_tolerance=5
):
    ppm_range = observed_mz * (ppm_tolerance / 1e6)
    lower_limit = observed_mz - ppm_range
    upper_limit = observed_mz + ppm_range
    return pd.Series([ppm_range, lower_limit, upper_limit], index=['ppm_range', 'lower_limit', 'upper_limit'])

[docs]def metabolite_annotation( adata_SM:AnnDataSM, adduct_type: str, adduct_method: str, tolerance_ppm: float = 5, inplace: bool = True ) -> pd.DataFrame: """ Annotate metabolites based on the m/z values. :param adata_SM: AnnDataSM. The AnnDataSM object. :param adduct_type: str. The adduct type. :param adduct_method: str. The adduct method, 'add' or 'sub'. :param tolerance_ppm: float, default 5. The tolerance in ppm. :param inplace: bool, default True. Whether to modify the AnnDataSM object inplace. :return: pd.DataFrame. The annotated metabolites. """ sf_results_df = pd.read_csv(MODULE_PATH / "../data/hmdb.csv", index_col=0) if adduct_type==None: adduct_mz = 0 else: f_adduct = Formula(adduct_type) adduct_mz = f_adduct.monoisotopic_mass if adduct_method == 'add': sf_results_df['m/z_addion'] = sf_results_df['monisotopic_molecular_weight'] + adduct_mz elif adduct_method == 'sub': sf_results_df['m/z_addion'] = sf_results_df['monisotopic_molecular_weight'] - adduct_mz else: raise ValueError("adduct_method should be 'add' or 'sub'") sf_results_df[['ppm_range', 'lower_limit', 'upper_limit']] = sf_results_df['m/z_addion'].apply( lambda x: _calculate_ppm_range(x, ppm_tolerance=tolerance_ppm)) var_df = adata_SM.var.copy() var_df['name'] = var_df['name'].astype(float) var_df['accession'] = None var_df['metabolite_name'] = None var_df['iupac_name']=None var_df["chemical_formula"]=None var_df['kegg']=None var_df['bigg']=None var_df['direct_parent']=None var_df['class']=None var_df['sub_class']=None sf_results_df['lower_limit'] = sf_results_df['lower_limit'].astype(float) sf_results_df['upper_limit'] = sf_results_df['upper_limit'].astype(float) #sf_results_df = sf_results_df.sort_values('lower_limit') kdtree = cKDTree(sf_results_df.loc[:,['lower_limit','upper_limit']]) for index,row in var_df.iterrows(): #print(index) mz_target = row['name'] query_point=[mz_target,mz_target] distance, index2 = kdtree.query(query_point) lower_limit = float(sf_results_df.loc[index2]['lower_limit']) upper_limit = float(sf_results_df.loc[index2]['upper_limit']) mz_target = float(mz_target) if lower_limit <= mz_target <= upper_limit: sf_accession = sf_results_df.loc[index2]['accession'] sf_name = sf_results_df.loc[index2]['name'] sf_iupac_name = sf_results_df.loc[index2]['iupac_name'] sf_chemical_formula = sf_results_df.loc[index2]['chemical_formula'] sf_kegg = sf_results_df.loc[index2]['kegg'] sf_bigg = sf_results_df.loc[index2]['bigg'] sf_direct_parent = sf_results_df.loc[index2]['direct_parent'] sf_class = sf_results_df.loc[index2]['class'] sf_subclass = sf_results_df.loc[index2]['sub_class'] else: sf_accession = None sf_name = None sf_iupac_name = None sf_chemical_formula = None sf_kegg = None sf_bigg = None sf_direct_parent = None sf_class = None sf_subclass = None var_df.loc[index,'accession'] = sf_accession var_df.loc[index,'metabolite_name'] = sf_name var_df.loc[index,'iupac_name'] = sf_iupac_name var_df.loc[index,'chemical_formula'] = sf_chemical_formula var_df.loc[index,'kegg'] = sf_kegg var_df.loc[index,'bigg'] = sf_bigg var_df.loc[index,'direct_parent'] = sf_direct_parent var_df.loc[index,'class'] = sf_class var_df.loc[index,'sub_class'] = sf_subclass if inplace: adata_SM.var['accession'] = var_df['accession'].values adata_SM.var['metabolite_name'] = var_df['metabolite_name'].values adata_SM.var['iupac_name'] = var_df['iupac_name'].values adata_SM.var['chemical_formula'] = var_df['chemical_formula'].values adata_SM.var['kegg'] = var_df['kegg'].values adata_SM.var['bigg'] = var_df['bigg'].values adata_SM.var['direct_parent'] = var_df['direct_parent'].values adata_SM.var['class'] = var_df['class'].values adata_SM.var['sub_class'] = var_df['sub_class'].values return var_df