h5ad_ID2symbol

[ ]:
import anndata as ad
import pandas as pd
import numpy as np
from gseapy import Biomart
# Load the dataset; its var index holds Ensembl gene IDs (see the preview in the next cell).
adata = ad.read_h5ad("kidney_tumors.h5ad")
[3]:
# Preview the first two rows of the per-gene annotation table (indexed by Ensembl gene ID).
adata.var.head(2)
[3]:
vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable feature_is_filtered feature_name feature_reference feature_biotype
ENSG00000243485 0.000216 0.000216 0.000216 0.999738 0 0 15855 0 0
ENSG00000237613 0.000000 0.000000 0.000000 0.000000 0 0 8370 0 0
[7]:
# gseapy BioMart client, used below to look up gene symbols for the Ensembl IDs.
bm = Biomart()
[72]:
# Number of Ensembl IDs sent per BioMart request.
BATCH_SIZE = 400
QUERY_ATTRIBUTES = ['ensembl_gene_id', 'external_gene_name']

var_names = adata.var_names.to_list()

# Collect each batch's result and concatenate once at the end; concatenating
# inside the loop re-copies the growing frame on every iteration.
batch_results = []
for start in range(0, len(var_names), BATCH_SIZE):
    # Stepping by BATCH_SIZE never produces an empty batch, so no request is
    # ever sent with an empty ID filter (which the previous index arithmetic
    # did when the number of IDs was zero or an exact multiple of 400).
    subl = var_names[start:start + BATCH_SIZE]

    queries = {'ensembl_gene_id': subl}  # need to be a dict object
    results = bm.query(dataset='hsapiens_gene_ensembl',
                       attributes=QUERY_ATTRIBUTES,
                       filters=queries)

    # Fail loudly if the service did not hand back a table, instead of
    # letting a later concat/indexing step raise an unrelated error.
    if not isinstance(results, pd.DataFrame):
        raise RuntimeError(
            f"BioMart query failed for IDs {start}-{start + len(subl) - 1}: {results!r}"
        )
    batch_results.append(results)

# Keep the expected columns even when there was nothing to query so the
# following cells can still index by column name.
if batch_results:
    df = pd.concat(batch_results, axis=0)
else:
    df = pd.DataFrame(columns=QUERY_ATTRIBUTES)
[54]:
# Size of the combined BioMart result as (rows, columns).
df.shape
[54]:
(32926, 2)
[76]:
# Keep exactly one row per Ensembl ID. De-duplicating on the ID column (rather
# than on whole rows) guarantees a unique index even if the same ID was
# returned with more than one symbol; the first occurrence wins.
df = df.drop_duplicates(subset="ensembl_gene_id", keep="first")

# Index the lookup table by Ensembl ID so later cells can select rows by ID.
df.index = df["ensembl_gene_id"]
[78]:
# Rows where BioMart returned no symbol have a missing external_gene_name.
symbol_missing = df["external_gene_name"].isna()
not_conver_index = df.index[symbol_missing.to_numpy()].to_list()

# Fall back to the Ensembl ID itself so every gene keeps a usable label.
df.loc[not_conver_index, "external_gene_name"] = not_conver_index
[79]:
# Size of the lookup table after duplicate rows were dropped.
df.shape
[79]:
(32844, 2)
[84]:
# Default every gene's label to its current var index entry (the Ensembl ID),
# so genes absent from the lookup table keep their ID.
gene_ids = adata.var.index
adata.var["external_gene_name"] = gene_ids

# For the IDs present in the lookup table, overwrite the default with the symbol.
in_lookup = gene_ids.isin(df.index)
have_symbol = gene_ids[in_lookup]
symbols = df.loc[have_symbol, "external_gene_name"]
adata.var.loc[have_symbol, "external_gene_name"] = symbols
[86]:
# Use the gene labels as the var index. Assigning the column directly also names
# the index "external_gene_name", i.e. the same as the column it was taken from.
adata.var.index = adata.var["external_gene_name"]
[88]:
# Inspect the new var names: gene symbols where one was found, Ensembl IDs otherwise.
adata.var_names
[88]:
Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'ENSG00000238009', 'ENSG00000239945',
       'ENSG00000239906', 'ENSG00000241599', 'DDX11L17', 'WASH9P',
       'ENSG00000228463',
       ...
       'ENSG00000277196', 'ENSG00000277630', 'ENSG00000278384',
       'ENSG00000278633', 'ENSG00000276345', 'ENSG00000277856',
       'ENSG00000275063', 'ENSG00000271254', 'ENSG00000277475',
       'ENSG00000268674'],
      dtype='object', name='external_gene_name', length=32922)
[90]:
# De-duplicate the var names in place (anndata appends a numeric suffix to repeated names).
adata.var_names_make_unique()

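Note: the `var` index name must not duplicate a column name (here both would be `external_gene_name`), so the index is renamed before the file is written.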

[96]:
# Rename the index so it no longer shares its name with the "external_gene_name" column.
adata.var.index.name = "index_name"
[97]:
# Save the converted object under a new file name; the input file is left untouched.
adata.write("kidney_tumors_convert_symbol.h5ad")
[ ]:

[ ]: