h5ad_ID2symbol
h5ad_ID2symbol
[ ]:
import anndata as ad
import pandas as pd
import numpy as np
from gseapy import Biomart
# Load the AnnData object from the .h5ad file in the working directory.
adata = ad.read_h5ad("kidney_tumors.h5ad")
[3]:
# Peek at the first two rows of the per-gene annotation table (adata.var);
# its index holds the Ensembl gene IDs that are converted below.
adata.var.head(2)
[3]:
| vst.mean | vst.variance | vst.variance.expected | vst.variance.standardized | vst.variable | feature_is_filtered | feature_name | feature_reference | feature_biotype | |
|---|---|---|---|---|---|---|---|---|---|
| ENSG00000243485 | 0.000216 | 0.000216 | 0.000216 | 0.999738 | 0 | 0 | 15855 | 0 | 0 |
| ENSG00000237613 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | 0 | 8370 | 0 | 0 |
[7]:
# gseapy BioMart client, used below for the Ensembl-ID -> gene-symbol queries.
bm = Biomart()
[72]:
# Look up the gene symbol (external_gene_name) for every Ensembl gene ID in
# adata.var_names. The IDs are sent to BioMart in fixed-size batches instead
# of in one single request.
BATCH_SIZE = 400
var_names = adata.var_names.to_list()

batch_results = []
# Stepping by BATCH_SIZE yields exactly the non-empty slices. The previous
# index arithmetic issued one extra query with an empty ID list whenever
# len(var_names) was an exact multiple of the batch size (or zero).
for start in range(0, len(var_names), BATCH_SIZE):
    subl = var_names[start:start + BATCH_SIZE]
    queries = {'ensembl_gene_id': subl}  # need to be a dict object
    results = bm.query(dataset='hsapiens_gene_ensembl',
                       attributes=['ensembl_gene_id', 'external_gene_name'],
                       filters=queries)
    batch_results.append(results)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration.
df = pd.concat(batch_results, axis=0) if batch_results else pd.DataFrame()
[54]:
# Number of (ensembl_gene_id, external_gene_name) rows collected from BioMart.
df.shape
[54]:
(32926, 2)
[76]:
# Index the lookup table by Ensembl gene ID so it can be addressed with .loc.
df.index = df["ensembl_gene_id"]
# Keep exactly one row per Ensembl ID. De-duplicating on the ID column (rather
# than on whole rows) guarantees unique index labels even if the same ID comes
# back with more than one symbol; duplicate labels would make the later
# label-based .loc lookup return several rows for a single gene.
df = df.drop_duplicates(subset="ensembl_gene_id", keep="first")
[78]:
# Rows whose external_gene_name is missing fall back to their Ensembl ID
# (the index label), so every gene ends up with a usable name.
# .isna() replaces the `x != x` NaN trick: it states the intent directly and
# also catches None.
missing_symbol = df["external_gene_name"].isna()
not_conver_index = df.index[missing_symbol].to_list()
if missing_symbol.any():
    # A boolean mask selects rows positionally, so the assignment stays
    # correct even if index labels repeat.
    df.loc[missing_symbol, "external_gene_name"] = not_conver_index
[79]:
# Row count of the lookup table after duplicates were dropped.
df.shape
[79]:
(32844, 2)
[84]:
# Start with every gene labelled by its own Ensembl ID, then overwrite the
# label with the looked-up name for each ID that is present in the lookup
# table `df`.
gene_ids = adata.var.index
adata.var["external_gene_name"] = gene_ids

found_in_lookup = gene_ids.isin(df.index)
have_symbol = gene_ids[found_in_lookup]

symbol_by_id = df["external_gene_name"]
adata.var.loc[have_symbol, "external_gene_name"] = symbol_by_id.loc[have_symbol]
[86]:
# Use the gene names as the var index (and therefore as adata.var_names).
# Assigning the column Series also names the index "external_gene_name",
# i.e. the same as the column; the index is renamed before the file is saved.
adata.var.index = adata.var["external_gene_name"]
[88]:
# Inspect the converted var names (symbols where available, Ensembl IDs otherwise).
adata.var_names
[88]:
Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'ENSG00000238009', 'ENSG00000239945',
'ENSG00000239906', 'ENSG00000241599', 'DDX11L17', 'WASH9P',
'ENSG00000228463',
...
'ENSG00000277196', 'ENSG00000277630', 'ENSG00000278384',
'ENSG00000278633', 'ENSG00000276345', 'ENSG00000277856',
'ENSG00000275063', 'ENSG00000271254', 'ENSG00000277475',
'ENSG00000268674'],
dtype='object', name='external_gene_name', length=32922)
[90]:
# Make repeated var names unique in place (anndata appends a numeric suffix
# to duplicates).
adata.var_names_make_unique()
Note: the index name must not be the same as a column name.
[96]:
# Give the var index a name that differs from every column name; it was
# "external_gene_name", identical to the column it was built from.
adata.var.index.name = "index_name"
[97]:
# Save the AnnData object, now indexed by gene name, to a new .h5ad file.
adata.write("kidney_tumors_convert_symbol.h5ad")
[ ]:
[ ]: