Rapid unleashing of macrophage efferocytic capacity via transcriptional pause/release (commands.sh)
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# This a notebook for ATAC-seq data analysis"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import os.path\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import random\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def factors_colormap(n):\n",
" if n <= 10:\n",
" return plt.cm.get_cmap('tab10', n)\n",
" if n <= 20:\n",
" return plt.cm.get_cmap('tab20', n)\n",
" else:\n",
" return plt.cm.get_cmap('nipy_spectral', n)\n",
"\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"SMALL_SIZE = 4\n",
"MEDIUM_SIZE = 6\n",
"BIGGER_SIZE = 10\n",
"\n",
"sns.set_style('white')\n",
"plt.rc('font', size=SMALL_SIZE) # controls default text sizes\n",
"plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title\n",
"plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels\n",
"plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels\n",
"plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels\n",
"plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize\n",
"plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"PATH = os.path.expanduser('~/data/2022_Atac-Seq-2')\n",
"! mkdir -p {PATH} /pics"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"def init_seed(seed):\n",
" random.seed(seed)\n",
" np.random.seed(seed)\n",
"\n",
"\n",
"init_seed(42)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join(PATH, 'diff_pairwise_counts.csv'))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"DATA_COLUMNS_ALL = ['MacAlone.1', 'MacAlone.2', 'MacAlone.3', 'MacAlone.4',\n",
" 'Mac.15min.1', 'Mac.15min.2', 'Mac.15min.3', 'Mac.15min.4',\n",
" 'Mac.30min_AC.1', 'Mac.30min_AC.2', 'Mac.30min_AC.3', 'Mac.30min_AC.4',\n",
" 'Mac.45min_AC.1', 'Mac.45min_AC.2', 'Mac.45min_AC.3', 'Mac.45min_AC.4']"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"plt.figure(figsize=(4, 6))\n",
"sns.heatmap(df[DATA_COLUMNS_ALL], cmap=plt.cm.coolwarm)\n",
"plt.title('All columns')\n",
"# plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/all.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn import preprocessing\n",
"\n",
"plt.figure(figsize=(4, 6))\n",
"sns.heatmap(preprocessing.minmax_scale(df[DATA_COLUMNS_ALL], feature_range=(0, 1), axis=1, copy=True),\n",
" cmap=plt.cm.coolwarm)\n",
"plt.title('All columns')\n",
"# plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/all_scaled.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"plt.figure(figsize=(4, 6))\n",
"scaled = preprocessing.StandardScaler().fit_transform(df[DATA_COLUMNS_ALL].T).T\n",
"scaled = scaled.clip(min=-2, max=2)\n",
"df_scaled = df.copy()\n",
"df_scaled[DATA_COLUMNS_ALL] = scaled\n",
"sns.heatmap(df_scaled[DATA_COLUMNS_ALL], cmap=plt.cm.coolwarm)\n",
"plt.title('All columns')\n",
"# plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/data_scaled.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"OUTLIERS = ['MacAlone.1', 'Mac.30min_AC.1', 'Mac.45min_AC.2']\n",
"DATA_COLUMNS = [c for c in DATA_COLUMNS_ALL if c not in OUTLIERS]\n",
"df.drop(OUTLIERS, axis=1, inplace=True)\n",
"df"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.cluster import AgglomerativeClustering\n",
"from sklearn.cluster import KMeans\n",
"import numpy as np\n",
"\n",
"from collections import Counter\n",
"\n",
"\n",
"def get_clusters(x, n_clusters, method=KMeans):\n",
" # Ward linkage method as default with euclidean distance\n",
" print('Clustering', n_clusters)\n",
" return method(n_clusters=n_clusters).fit_predict(x).astype(int)\n",
"\n",
"\n",
"def reorder_clusters_by_size(clusters):\n",
" clusters = np.array(clusters)\n",
" cluster_counter = Counter()\n",
" for c in clusters:\n",
" cluster_counter[c] += 1\n",
" clusters_reord = np.zeros(len(clusters), dtype=int)\n",
" for i, (c, n) in enumerate(cluster_counter.most_common()):\n",
" clusters_reord[np.array(clusters) == c] = i\n",
" return clusters_reord\n",
"\n",
"\n",
"def clusters_sizes(clusters, cmap):\n",
" t = pd.DataFrame(dict(cluster=clusters, size=np.ones(len(clusters))))\n",
" t_sum = t[['cluster', 'size']].groupby(['cluster']).sum().reset_index()\n",
" plt.figure(figsize=(10, 4))\n",
" sns.barplot(x=t_sum.index, y=t_sum['size'],\n",
" palette=[cmap(i) for i in range(len(t_sum))])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"N_CLUSTERS = 10\n",
"\n",
"clusters = reorder_clusters_by_size(get_clusters(df_scaled[DATA_COLUMNS], N_CLUSTERS))\n",
"\n",
"cmap = factors_colormap(N_CLUSTERS)\n",
"\n",
"plt.figure(figsize=(4, 1))\n",
"clusters_sizes(clusters, cmap)\n",
"plt.title('Clusters')\n",
"plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/clusters.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"df_scaled['Cluster'] = clusters\n",
"df_scaled.sort_values(['Cluster'], inplace=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"row_colors = [cmap(c) for c in df_scaled['Cluster']]\n",
"sns.clustermap(df_scaled[DATA_COLUMNS],\n",
" col_cluster=False, row_cluster=False,\n",
" row_colors=row_colors,\n",
" figsize=(4, 6), cmap=plt.cm.coolwarm)\n",
"plt.title('All columns')\n",
"plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/with_clusters.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"df_scaled"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Save cluster BED files"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"! mkdir -p {PATH} /clusters\n",
"\n",
"for c in range(N_CLUSTERS):\n",
" dfc = df_scaled[df_scaled['Cluster'] == c]\n",
" t = dfc[['seqnames', 'start', 'end']].copy()\n",
" t.sort_values(['seqnames', 'start', 'end'], inplace=True)\n",
" t.to_csv(f'/{PATH}/clusters/diff_pairwise_counts_{c}.bed3',\n",
" header=None, index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Hierarchical clustering"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"sns.clustermap(df_scaled[DATA_COLUMNS],\n",
" col_cluster=False, row_cluster=True,\n",
" row_colors=row_colors,\n",
" figsize=(10, 10), cmap=plt.cm.coolwarm)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Heatmap of promoter regions at timepoints"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"GENES = [\n",
" 'PLK2',\n",
" 'MIR22HG',\n",
" 'ID3',\n",
" 'GADD45G',\n",
" 'PER1',\n",
" 'AI839979',\n",
" 'ID1',\n",
" 'PPP1R10',\n",
" 'PTP4A1',\n",
" 'PTGS2',\n",
" 'TNF',\n",
" 'CIART',\n",
" 'MYC',\n",
" 'NFKBID',\n",
" 'ALKBH2',\n",
" 'KDM6B',\n",
" 'GDF15',\n",
" 'DUSP8',\n",
" 'BTG2',\n",
" 'A530013C23RIK',\n",
" 'CHAC1',\n",
" 'CCL3',\n",
" 'ODC1',\n",
" 'GM49774',\n",
" 'FOS',\n",
" 'DUSP1',\n",
" 'RECQL4',\n",
" 'CCL4',\n",
" 'IER3',\n",
" 'SPEF1L',\n",
" 'E230013L22RIK',\n",
" 'CSRNP1',\n",
" 'PHLDA1',\n",
" 'PLK3',\n",
" 'DUSP5',\n",
" '9930022D16RIK',\n",
" 'OSM',\n",
" 'IL10',\n",
" 'MYBPC3',\n",
" 'IER2',\n",
" 'FOSB',\n",
" 'CXCL2',\n",
" 'EDN1',\n",
" 'NR4A1',\n",
" 'GM42793',\n",
" 'GM22748',\n",
" 'EGR1',\n",
" 'EGR2',\n",
" 'EGR3'\n",
"]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Load mm10 gtf file"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"gtf_df = pd.read_csv(PATH + '/gencode.vM25.annotation.gtf.gz',\n",
" sep='\\t', comment='#', compression='gzip',\n",
" names=['chromosome', 'db', 'type', 'start', 'end', 'point1', 'strand', 'point2', 'aux'])\n",
"gtf_df.sample(10)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from tqdm.auto import tqdm\n",
"import re\n",
"\n",
"print('Parse GTF aux data')\n",
"auxes = {}\n",
"for i, aux in enumerate(tqdm(gtf_df['aux'])):\n",
" for pair in aux.split(';'):\n",
" kv = pair.strip().split(' ')\n",
" if len(kv) != 2:\n",
" continue\n",
" k, v = kv\n",
" if k not in auxes:\n",
" auxes[k] = vs = []\n",
" else:\n",
" vs = auxes[k]\n",
" vs.append(v.strip('\"'))\n",
"\n",
"for k, vs in auxes.items():\n",
" if len(vs) == len(gtf_df):\n",
" gtf_df[k] = vs\n",
" else:\n",
" print(f'Ignoring {k}')\n",
"del auxes\n",
"gtf_df.drop('aux', axis=1, inplace=True)\n",
"\n",
"# Fix . in gene_id\n",
"gtf_df['gene_id'] = [re.sub('\\..*', '', id) for id in gtf_df['gene_id']]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print(f'Total mm10 records {len(gtf_df)}')\n",
"print(f'Total mm10 genes {sum(gtf_df[\"type\"] == \"gene\")}')\n",
"print(f'Total mm10 protein_coding genes {sum((gtf_df[\"type\"] == \"gene\") & (gtf_df[\"gene_type\"] == \"protein_coding\"))}')\n",
"\n",
"gtf_genes_df = gtf_df[gtf_df['type'] == 'gene']\n",
"gtf_genes_df.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print('Total genes', len(GENES))\n",
"gft_selected_genes_df = gtf_genes_df[gtf_genes_df['gene_name'].str.upper().isin(set(GENES))].copy()\n",
"gft_selected_genes_df.drop_duplicates(['gene_name'], inplace=True)\n",
"not_found_genes = [g for g in GENES if g.upper() not in gft_selected_genes_df['gene_name'].str.upper().values]\n",
"print('Not found genes', len(not_found_genes))\n",
"print(not_found_genes)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"RANGES = [1000, 2000, 5000]\n",
"\n",
"for tss_range in RANGES:\n",
" gft_selected_genes_df[f'tss_start_{tss_range}'] = [start - tss_range if strand == '+' else end - tss_range\n",
" for start, end, strand in zip(gft_selected_genes_df['start'],\n",
" gft_selected_genes_df['end'],\n",
" gft_selected_genes_df['strand'])]\n",
" gft_selected_genes_df[f'tss_end_{tss_range}'] = [start + tss_range if strand == '+' else end + tss_range\n",
" for start, end, strand in zip(gft_selected_genes_df['start'],\n",
" gft_selected_genes_df['end'],\n",
" gft_selected_genes_df['strand'])]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Load bigwigs"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"PATH_BW = PATH + '/bw_20mln'\n",
"\n",
"REPLICATES = [1, 2, 3, 4]\n",
"TIMEPOINTS = ['Alone', '15min', '30min', '45min']\n",
"\n",
"\n",
"def load_bws(path):\n",
" df_bws = pd.DataFrame(columns=['file', 'timepoint', 'replicate'], dtype=object)\n",
" for f in tqdm(os.listdir(path)):\n",
" if '.bw' not in f:\n",
" continue\n",
" timepoint = next((t for t in TIMEPOINTS if t in f), None)\n",
" replicate = next((r for r in REPLICATES if f'-{r}_' in f), None)\n",
" if timepoint and replicate:\n",
" df_bws.loc[len(df_bws)] = ((os.path.join(path, f)), timepoint, replicate)\n",
" return df_bws\n",
"\n",
"\n",
"bws_df = load_bws(PATH_BW)\n",
"bws_df.sample(3)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"CHROM_SIZES = {\n",
" c: s for _, (c, s) in pd.read_csv(os.path.join(PATH, 'mm10.chrom.sizes'),\n",
" sep='\\t', names=['chr', 'size']).iterrows() if '_' not in c\n",
"}\n",
"CHROM_SIZES"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from itertools import product\n",
"import pyBigWig\n",
"\n",
"total_coverages = {}\n",
"\n",
"for t, r in tqdm(product(TIMEPOINTS, REPLICATES)):\n",
" bws_df_tr = bws_df[(bws_df['timepoint'] == t) & (bws_df['replicate'] == r)]\n",
" if len(bws_df_tr) == 0:\n",
" continue\n",
" bw_file = bws_df_tr['file'].values[0]\n",
" with pyBigWig.open(bw_file) as bw:\n",
" total_coverage = sum(bw.stats(chr, type='sum', exact=True)[0] for chr in CHROM_SIZES)\n",
" total_coverages[(t, r)] = total_coverage\n",
"total_coverages"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"coverage_data = []\n",
"\n",
"for t, r in tqdm(product(TIMEPOINTS, REPLICATES)):\n",
" bws_df_tr = bws_df[(bws_df['timepoint'] == t) & (bws_df['replicate'] == r)]\n",
" if len(bws_df_tr) == 0:\n",
" continue\n",
" bw_file = bws_df_tr['file'].values[0]\n",
" with pyBigWig.open(bw_file) as bw:\n",
" for tss_range in RANGES:\n",
" for gene_name, chromosome, tss_start, tss_end in zip(\n",
" gft_selected_genes_df['gene_name'],\n",
" gft_selected_genes_df['chromosome'],\n",
" gft_selected_genes_df[f'tss_start_{tss_range}'],\n",
" gft_selected_genes_df[f'tss_end_{tss_range}']\n",
" ):\n",
" if tss_start < 0:\n",
" print('Negative tss_start', gene_name, tss_start)\n",
" continue\n",
" if tss_end > CHROM_SIZES[chromosome]:\n",
" print('Out of bounds tss_end', gene_name, tss_end, CHROM_SIZES[chromosome])\n",
" continue\n",
" if tss_start > tss_end:\n",
" print('Illegal range', tss_start, tss_end)\n",
" continue\n",
" tss_coverage = bw.stats(chromosome, tss_start, tss_end, type='sum', exact=True)[0]\n",
" coverage_data.append((t, r, gene_name, chromosome, tss_range, tss_start, tss_end,\n",
" tss_end - tss_start, tss_coverage, total_coverages[(t, r)]))\n",
"\n",
"df_coverage = pd.DataFrame(\n",
" coverage_data,\n",
" columns=['timepoint', 'replicate', 'gene', 'chromosome', 'range', 'tss_start', 'tss_end',\n",
" 'length', 'coverage', 'total_coverage']\n",
")\n",
"del coverage_data\n",
"df_coverage.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"df_coverage['rpkm'] = [c / (total_coverages[(t, r)] / 1e6) / (l / 1e3) for t, r, l, c in\n",
" zip(\n",
" df_coverage['timepoint'],\n",
" df_coverage['replicate'],\n",
" df_coverage['length'],\n",
" df_coverage['coverage']\n",
" )]\n",
"df_coverage['name'] = df_coverage['timepoint'] + ' ' + df_coverage['replicate'].astype(str)\n",
"df_coverage.sample(3)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from scipy.stats import zscore\n",
"\n",
"cmap = factors_colormap(4)\n",
"\n",
"\n",
"def col_color(c):\n",
" for i, t in enumerate(TIMEPOINTS):\n",
" if c.startswith(t):\n",
" return cmap(i)\n",
"\n",
"\n",
"for tss_range in RANGES:\n",
" print(f'TSS +-{tss_range} ATAC-seq coverage')\n",
" t = df_coverage[df_coverage['range'] == tss_range].pivot(index='gene', columns='name', values='rpkm')\n",
" t = t[[f'{t} {r}' for t, r in product(TIMEPOINTS, REPLICATES) if not (t == 'Alone' and r == 1)]].copy()\n",
" t = zscore(t, axis=1)\n",
" # print(t.isna())\n",
" # plt.figure(figsize=(5, 12))\n",
" col_colors = [col_color(c) for c in t.columns]\n",
" sns.clustermap(t,\n",
" # z_scale=0,\n",
" col_cluster=False, row_cluster=True,\n",
" col_colors=col_colors,\n",
" figsize=(3, 7),\n",
" cmap=plt.cm.coolwarm,\n",
" cbar_pos=(1, .0, .03, .4))\n",
" # plt.tight_layout()\n",
" plt.savefig(f'{PATH}/pics/tss_{tss_range}.pdf', bbox_inches='tight', dpi=300)\n",
" plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Coverage in closest diff peaks to TSS"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import tempfile\n",
"\n",
"tf = tempfile.mktemp()\n",
"\n",
"t = gft_selected_genes_df[['chromosome', 'tss_start_1000', 'tss_end_1000', 'gene_name']].copy()\n",
"t.sort_values(by=['chromosome', 'tss_start_1000', 'tss_end_1000'], inplace=True)\n",
"t.to_csv(PATH + '/genes_tss.bed', header=None, index=False, sep='\\t')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"! bedtools closest -a {PATH} /genes_tss.bed -b {PATH} /all_peaks.bed > {tf}\n",
"print(tf)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"t = pd.read_csv(f'{PATH}/all_peaks.bed',\n",
" names=['chromosome', 'start', 'end', 'ensembleid', 'strand', 'position', 'distance'],\n",
" sep='\\t')\n",
"t.sample(3)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"associated_peaks = pd.merge(\n",
" left=t, right=gtf_genes_df[['gene_id', 'gene_name']], left_on='ensembleid', right_on='gene_id'\n",
")\n",
"associated_peaks.sort_values(by=['gene_id', 'distance'], inplace=True)\n",
"associated_peaks.drop_duplicates(['gene_id'], inplace=True)\n",
"associated_peaks"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print('Total genes', len(GENES))\n",
"associated_peaks_genes = associated_peaks[associated_peaks['gene_name'].str.upper().isin(set(GENES))].copy()\n",
"not_found_genes = [g for g in GENES if g.upper() not in associated_peaks['gene_name'].str.upper().values]\n",
"print('Not found genes', len(not_found_genes))\n",
"print(not_found_genes)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"coverage_data = []\n",
"\n",
"for t, r in tqdm(product(TIMEPOINTS, REPLICATES)):\n",
" bws_df_tr = bws_df[(bws_df['timepoint'] == t) & (bws_df['replicate'] == r)]\n",
" if len(bws_df_tr) == 0:\n",
" continue\n",
" bw_file = bws_df_tr['file'].values[0]\n",
" with pyBigWig.open(bw_file) as bw:\n",
" for gene_name, chromosome, start, end in zip(\n",
" associated_peaks_genes['gene_name'],\n",
" associated_peaks_genes['chromosome'],\n",
" associated_peaks_genes['start'],\n",
" associated_peaks_genes['end']\n",
" ):\n",
" coverage = bw.stats(chromosome, start, end, type='sum', exact=True)[0]\n",
" coverage_data.append((t, r, gene_name, chromosome, start, end,\n",
" end - start, coverage, total_coverages[(t, r)]))\n",
"\n",
"df_coverage_peaks = pd.DataFrame(\n",
" coverage_data,\n",
" columns=['timepoint', 'replicate', 'gene', 'chromosome', 'start', 'end',\n",
" 'length', 'coverage', 'total_coverage']\n",
")\n",
"del coverage_data\n",
"df_coverage_peaks.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"df_coverage_peaks['rpkm'] = [c / (total_coverages[(t, r)] / 1e6) / (l / 1e3) for t, r, l, c in\n",
" zip(\n",
" df_coverage_peaks['timepoint'],\n",
" df_coverage_peaks['replicate'],\n",
" df_coverage_peaks['length'],\n",
" df_coverage_peaks['coverage']\n",
" )]\n",
"df_coverage_peaks['name'] = df_coverage_peaks['timepoint'] + ' ' + df_coverage_peaks['replicate'].astype(str)\n",
"df_coverage_peaks.sample(3)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print('Associated peaks coverage')\n",
"t = df_coverage_peaks.pivot(index='gene', columns='name', values='rpkm')\n",
"t = t[[f'{t} {r}' for t, r in product(TIMEPOINTS, REPLICATES) if not (t == 'Alone' and r == 1)]].copy()\n",
"t = zscore(t, axis=1)\n",
"# print(t.isna())\n",
"# plt.figure(figsize=(5, 12))\n",
"col_colors = [col_color(c) for c in t.columns]\n",
"sns.clustermap(t,\n",
" col_cluster=False, row_cluster=True,\n",
" col_colors=col_colors,\n",
" figsize=(3, 7),\n",
" cmap=plt.cm.coolwarm,\n",
" cbar_pos=(1, .0, .03, .4))\n",
"# plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/associated_peaks.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"# Signal profile"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"BINS = 100\n",
"coverage_data = []\n",
"\n",
"for t, r in tqdm(product(TIMEPOINTS, REPLICATES)):\n",
" bws_df_tr = bws_df[(bws_df['timepoint'] == t) & (bws_df['replicate'] == r)]\n",
" if len(bws_df_tr) == 0:\n",
" continue\n",
" bw_file = bws_df_tr['file'].values[0]\n",
" with pyBigWig.open(bw_file) as bw:\n",
" for tss_range in RANGES:\n",
" for gene_name, chromosome, tss_start, tss_end in zip(\n",
" gft_selected_genes_df['gene_name'],\n",
" gft_selected_genes_df['chromosome'],\n",
" gft_selected_genes_df[f'tss_start_{tss_range}'],\n",
" gft_selected_genes_df[f'tss_end_{tss_range}']\n",
" ):\n",
" if tss_start < 0:\n",
" print('Negative tss_start', gene_name, tss_start)\n",
" continue\n",
" if tss_end > CHROM_SIZES[chromosome]:\n",
" print('Out of bounds tss_end', gene_name, tss_end, CHROM_SIZES[chromosome])\n",
" continue\n",
" if tss_start > tss_end:\n",
" print('Illegal range', tss_start, tss_end)\n",
" continue\n",
" tss_coverages = bw.stats(chromosome, tss_start, tss_end, nBins=BINS, type='sum', exact=True)\n",
" coverage_data.append((t, r, gene_name, chromosome, tss_range, tss_start, tss_end,\n",
" tss_end - tss_start, *(tss_coverages[i] for i in range(BINS)),\n",
" total_coverages[(t, r)]))\n",
"\n",
"df_coverage_bins = pd.DataFrame(\n",
" coverage_data,\n",
" columns=['timepoint', 'replicate', 'gene', 'chromosome', 'range', 'tss_start', 'tss_end',\n",
" 'length', *(f'coverage_{i}' for i in range(BINS)), 'total_coverage']\n",
")\n",
"del coverage_data\n",
"df_coverage_bins.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"for i in range(BINS):\n",
" df_coverage_bins[str(i)] = [c / (total_coverages[(t, r)] / 1e6) / (l / 1e3) * BINS for t, r, l, c in\n",
" zip(\n",
" df_coverage_bins['timepoint'],\n",
" df_coverage_bins['replicate'],\n",
" df_coverage_bins['length'],\n",
" df_coverage_bins[f'coverage_{i}']\n",
" )]\n",
"df_coverage_bins.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"t = df_coverage_bins[['gene', 'timepoint', 'range', *(str(i) for i in range(BINS))]].groupby(['gene', 'timepoint', 'range']).mean()\n",
"t['rpkm'] = t[[str(i) for i in range(BINS)]].sum(axis=1)\n",
"t.sort_values(by=['rpkm'], ascending=False, inplace=True)\n",
"t.drop(columns=['rpkm'], inplace=True)\n",
"t.reset_index(inplace=True)\n",
"t.sample(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# print(t.isna())\n",
"# plt.figure(figsize=(5, 12))\n",
"for tp, tss_range in product(TIMEPOINTS, RANGES):\n",
" print(f'Profile {tp} {tss_range}')\n",
" tt = t[(t['timepoint'] == tp) & (t['range'] == tss_range)].copy()\n",
" if len(tt) == 0:\n",
" continue\n",
" tt.index = tt['gene']\n",
" g = sns.clustermap(tt[[str(i) for i in range(BINS)]],\n",
" col_cluster=False, row_cluster=False,\n",
" figsize=(3, 6),\n",
" cmap=plt.cm.Blues,\n",
" cbar_pos=(1, .0, .03, .4))\n",
" g.ax_heatmap.set_xticks([0, int(BINS / 2), BINS - 1], minor=False)\n",
" g.ax_heatmap.set_xticklabels([-tss_range, 0, tss_range], rotation=0)\n",
" g.ax_heatmap.set_title(f'Profile {tp} {tss_range}')\n",
" # g.ax_heatmap.set_yticks([])\n",
" # plt.tight_layout()\n",
" plt.savefig(f'{PATH}/pics/heatmap_{tp}_{tss_range}.pdf', bbox_inches='tight', dpi=300)\n",
" plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# print(t.isna())\n",
"plt.figure(figsize=(15, 12))\n",
"axs = [plt.subplot(len(TIMEPOINTS), len(RANGES), i + 1) for i in range(len(TIMEPOINTS) * len(RANGES))]\n",
"for i, (tp, tss_range) in enumerate(product(TIMEPOINTS, RANGES)):\n",
" print(f'Profile {tp} {tss_range}')\n",
" tt = t[(t['timepoint'] == tp) & (t['range'] == tss_range)].copy()\n",
" tt.index = tt['gene']\n",
" tt = tt[[str(i) for i in range(BINS)]].stack().reset_index()\n",
" tt.columns = ['gene', 'i', 'rpkm']\n",
" g = sns.lineplot(data=tt,\n",
" x='i', y='rpkm',\n",
" errorbar='se',\n",
" ax=axs[i]\n",
" )\n",
" g.axes.set_xticks([0, int(BINS / 2), BINS - 1], minor=False)\n",
" g.axes.set_xticklabels([-tss_range, 0, tss_range], rotation=0)\n",
" g.axes.set_title(f'Profile {tp} {tss_range}')\n",
" g.axes.set_xlabel('distance to tss')\n",
"plt.tight_layout()\n",
"plt.savefig(f'{PATH}/pics/profiles.pdf', bbox_inches='tight', dpi=300)\n",
"plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# print(t.isna())\n",
"# plt.figure(figsize=(5, 12))\n",
"for tss_range in RANGES:\n",
" print(f'Profile {tss_range}')\n",
" plt.figure(figsize=(5, 3))\n",
" tt = t[t['range'] == tss_range].copy()\n",
" tt.index = tt['gene']\n",
" ts = []\n",
" for tp in TIMEPOINTS:\n",
" ttt = tt[tt['timepoint'] == tp][[str(i) for i in range(BINS)]].stack().reset_index()\n",
" ttt.columns = ['gene', 'i', 'rpkm']\n",
" ttt['timepoint'] = tp\n",
" ts.append(ttt)\n",
" tt = pd.concat(ts).reset_index(drop=True)\n",
" g = sns.lineplot(data=tt,\n",
" hue='timepoint',\n",
" x='i', y='rpkm',\n",
" errorbar='se'\n",
" )\n",
" g.axes.set_xticks([0, int(BINS / 2), BINS - 1], minor=False)\n",
" g.axes.set_xticklabels([-tss_range, 0, tss_range], rotation=0)\n",
" g.axes.set_title(f'Profile {tss_range}')\n",
" g.axes.set_xlabel('distance to tss')\n",
" plt.tight_layout()\n",
" plt.savefig(f'{PATH}/pics/profile_{tss_range}.pdf', bbox_inches='tight', dpi=300)\n",
" plt.show()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# ATAC-seq & PRO-seq & RNA-seq analysis
### Analysis of all ATAC-seq datasets
```
# chipseq-smk-pipeline
for FDR in 0.05; do
snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2022_Atac-Seq-2/ --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2022_Atac-Seq-2/fastq genome=mm10 bowtie2_params="-X 2000 --dovetail" macs2_mode=narrow macs2_params="-q $FDR -f BAMPE --nomodel --nolambda -B --call-summits" macs2_suffix="q$FDR";
done
```
### Trim reads with additional adapters using Trimmomatic
```
# Check trimming with trimmomatic
cd 2022_Atac-Seq-2/fastq
for F in *_R1_*; do
echo $F; N=${F/_R1_.fastq.gz/}; PF=${N}_R2_.fastq.gz; echo $PF;
trimmomatic PE -threads 8 -trimlog $N.txt -phred33 $F $PF ../trimmed/$F ../trimmed/${N}_unpaired_R1_.fastq.gz ../trimmed/$PF ../trimmed/${N}_unpaired_R2_.fastq.gz ILLUMINACLIP:/home/user/miniconda3/envs/bio/share/trimmomatic/adapters/NexteraPE-PE.fa:2:30:10:8:true;
done
```
### Reprocess all data with trimmed reads
```
for FDR in 0.05; do snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2022_Atac-Seq-2/ --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2022_Atac-Seq-2/trimmed genome=mm10 bowtie2_params="-X 2000 --dovetail" macs2_mode=narrow macs2_params="-q $FDR -f BAMPE --nomodel --nolambda -B --call-summits" macs2_suffix="q$FDR" -n; done
```
### Extract non-aligned reads to check against hg38
```
cd /mnt/stripe/shpynov/2022_Atac-Seq-2/bams
for F in *.bam; do
echo $F; N=${F/.bam/};
echo "Filtering"
# Extract unmapped read whose mate is mapped
samtools view -u -f 4 -F 264 $F > ../unmapped/${N}_unmapped_1.bam;
# Extract mapped read whose mate is unmapped
samtools view -u -f 8 -F 260 $F > ../unmapped/${N}_unmapped_2.bam;
# Extract unmapped read with unmapped mate
samtools view -u -f 12 -F 256 $F > ../unmapped/${N}_unmapped_3.bam;
echo "Merging"
samtools merge ../unmapped/${N}_unmapped.bam ../unmapped/${N}_unmapped_*.bam;
samtools sort -n -o ../unmapped/${N}_unmapped_sorted.bam ../unmapped/${N}_unmapped.bam;
echo "Bam to fastq"
bedtools bamtofastq -i ../unmapped/${N}_unmapped_sorted.bam -fq ../unmapped/${N}_unmapped.fastq;
rm ../unmapped/${N}_unmapped_*.bam;
done
```
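The three flag combinations above together select read pairs where at least one mate is unmapped, while excluding secondary alignments. A minimal pysam sketch of the same filter, with hypothetical file names, assuming pysam is installed:
```python
# Sketch only: mirrors the samtools flag filters above.
#   -f 4  -F 264 : read unmapped, mate mapped
#   -f 8  -F 260 : read mapped, mate unmapped
#   -f 12 -F 256 : both read and mate unmapped
# Union of the three: not a secondary alignment AND (read or mate unmapped).
import pysam

with pysam.AlignmentFile('sample.bam', 'rb') as bam, \
        pysam.AlignmentFile('sample_unmapped.bam', 'wb', template=bam) as out:
    for read in bam:  # sequential read-through, no index required
        if read.is_secondary:
            continue
        if read.is_unmapped or read.mate_is_unmapped:
            out.write(read)
```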
### Subsample all libraries to 20 million reads
```
# Move the previous full-depth files to a _full suffix first
READS=20000000
for F in *.bam; do
echo $F
FRACTION=$(samtools idxstats $F | cut -f3 | awk -v ct=$READS 'BEGIN {total=0} {total += $1} END {print ct/total}')
echo $FRACTION
samtools view -@ 4 -b -s ${FRACTION} $F > ${F/.bam/_20mln.bam}
done
# Visualization
for F in *.bam; do echo $F; bamCoverage -b $F -p 6 -o ${F/.bam/.bw}; done
# MACS2 peak calling
for F in *.bam; do echo $F; macs2 callpeak -t $F -g mm -q 0.05 -f BAMPE --nomodel --nolambda -B --call-summits -n ${F/.bam/_q0.05} 2>&1 | tee ${F/.bam/_macs2.log};
done
# SPAN peak calling
for F in *.bam; do echo $F; java -jar ~/span-1.0.build.jar analyze -t $F -cs mm10.chrom.sizes --fdr 0.05 -peaks ${F/.bam/_q0.05.peak} &> ${F/.bam/_span.log};
done
```
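The subsampling fraction is simply the target depth divided by the total mapped reads reported by `samtools idxstats` (column 3 summed over chromosomes). A quick Python check of that arithmetic, assuming pysam and an indexed BAM (file name is illustrative):
```python
# Sketch only: the fraction computed by the awk one-liner above.
import pysam

TARGET_READS = 20_000_000
# idxstats columns: reference, length, mapped reads, unmapped reads
mapped = sum(int(line.split('\t')[2])
             for line in pysam.idxstats('sample.bam').splitlines())
fraction = TARGET_READS / mapped  # value passed to `samtools view -s`
print(f'{fraction:.4f}')
```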
### Compute consensus peaks for all the groups
```
# IMPORTANT: Call bedtools merge after multiinter!
cd 2022_Atac-Seq-2/macs2
T=$(mktemp);
for F in Alone 15min 30min 45min; do echo $F; bedtools multiinter -i $(ls *$F*.narrowPeak) | grep -E '.*(.*,){2,}.*' | awk -v OFS='\t' '{print $1,$2,$3}' > $T; bedtools merge -i $T > ${F}_macs2_cons3.bed3;
done
cd ../span
for F in Alone 15min 30min 45min; do echo $F; bedtools multiinter -i $(ls *$F*.peak) | grep -E '.*(.*,){2,}.*' | awk -v OFS='\t' '{print $1,$2,$3}' > $T; bedtools merge -i $T > ${F}_span_cons3.bed3;
done
```
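The `grep -E '.*(.*,){2,}.*'` filter keeps `multiinter` intervals whose file-list column contains at least two commas, i.e. intervals supported by three or more replicates. A pandas sketch of the same filter using the numeric support column instead (input/output names are illustrative):
```python
# Sketch only: keep multiinter intervals supported by >= 3 peak files,
# using column 4 (number of files) rather than counting commas in column 5.
import pandas as pd

multiinter = pd.read_csv('Alone_multiinter.txt', sep='\t', header=None,
                         usecols=[0, 1, 2, 3],
                         names=['chrom', 'start', 'end', 'n_files'])
cons = multiinter[multiinter['n_files'] >= 3]
# the result still needs `bedtools merge`, as the note above stresses
cons[['chrom', 'start', 'end']].to_csv('Alone_cons3_unmerged.bed3',
                                       sep='\t', header=False, index=False)
```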
### R downstream analysis of consensus peaks
```
library(ChIPpeakAnno)
m15 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/15min_macs2_cons3.bed3", format="BED", header=FALSE)
m30 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/30min_macs2_cons3.bed3", format="BED", header=FALSE)
m45 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/45min_macs2_cons3.bed3", format="BED", header=FALSE)
ma = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/Alone_macs2_cons3.bed3", format="BED", header=FALSE)
ol <- findOverlapsOfPeaks(m15, m30, m45, ma)
png('/home/oshpynov/data/2022_Atac-Seq-2/venn.png')
makeVennDiagram(ol)
dev.off()
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
peaks <- GRangesList(m15=m15, m30=m30, m45=m45, ma=ma)
png('/home/oshpynov/data/2022_Atac-Seq-2/distribution.png', width=6, height=3, units="in", res=1200, pointsize = 1)
genomicElementDistribution(peaks, TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000))
dev.off()
overlaps <- ol$peaklist[['m15///m30///m45///ma']]
library(EnsDb.Mmusculus.v79)
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene")
png('/home/oshpynov/data/2022_Atac-Seq-2/anno.png', width=5, height=3, units="in", res=1200, pointsize = 1)
binOverFeature(overlaps, annotationData=annoData,radius=5000, nbins=20, FUN=length, errFun=0, xlab="distance from TSS (bp)", ylab="count", main="Distribution of aggregated peak numbers around TSS")
dev.off()
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500))
library(org.Mm.eg.db)
overlaps.anno <- addGeneIDs(overlaps.anno, "org.Mm.eg.db", IDs2Add = "entrez_id")
write.csv(as.data.frame(unname(overlaps.anno)), "anno.csv")
over <- getEnrichedGO(overlaps.anno, orgAnn="org.Mm.eg.db", condense=TRUE)
png('/home/oshpynov/data/2022_Atac-Seq-2/go.png', width=10, height=4, units="in", res=1200, pointsize = 1)
enrichmentPlot(over)
dev.off()
```
### Diffbind analysis
```
# Create DiffBind config
cd /mnt/stripe/shpynov/2022_Atac-Seq-2/diffbind
echo "SampleID,Tissue,Factor,Condition,Replicate,bamReads,ControlID,bamControl,Peaks,PeakCaller" > config.csv
for F in Mac-15min Mac-30min_AC Mac-45min_AC MacAlone; do for R in 1 2 3 4; do SAMPLE="$F-$R"; BAMREADS=$(find ~/data/2022_Atac-Seq-2/bams_20mln -name "$F*-${R}_20mln.bam"); PEAKS=$(find ~/data/2022_Atac-Seq-2/macs2_20mln -name "$F*-$R*.narrowPeak"); echo "$SAMPLE,,Type,$F,$R,$BAMREADS,,,$PEAKS,bed" >> config.csv; done; done
# R code
library(DiffBind)
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_45/")
samples = dba(sampleSheet = 'config.csv')
sample_counts = dba.count(samples)
png('counts.png')
plot(sample_counts)
dev.off()
sample_contrast = dba.contrast(sample_counts)
sample_contrast = dba.analyze(sample_contrast)
png('overlap.png')
overlap_rate = dba.overlap(sample_contrast, mode = DBA_OLAP_RATE)
plot(overlap_rate, type = "b", ylab = "# peaks", xlab = "Overlap at least this many peaksets")
dev.off()
png('pca.png')
dba.plotPCA(sample_contrast)
dev.off()
png('ma.png')
dba.plotMA(sample_contrast)
dev.off()
png('volcano.png')
dba.plotVolcano(sample_contrast)
dev.off()
sample_diff = dba.report(sample_contrast)
write.table(sample_diff, 'diff_alone_vs_45.csv', sep = ",", row.names = FALSE)
# BASH csv to bed3
cat diff_alone_vs_45.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_45.bed3
```
### Analysis of differential peaks
```
# Top 1000 most significant differential peaks
cat alone_vs_45.csv | grep -v 'start' | head -n 1000 | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > alone_vs_45_top1000.bed3
# Motif enrichment analysis
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl alone_vs_45_top1000.bed3 mm10 alone_vs_45_top1000_motifs -size 1000 -mask;
# Functional annotation analysis with ChIPseeker
library(ChIPseeker)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
# `peaks` is the GRanges object loaded with ChIPpeakAnno::toGRanges below
peakAnno <- annotatePeak(peaks, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/peakanno.png', width=5, height=2, units="in", res=1200, pointsize = 1)
plotAnnoBar(peakAnno)
dev.off()
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/tss.png', width=10, height=4, units="in", res=1200, pointsize = 1)
plotDistToTSS(peakAnno, title="Distribution relative to TSS")
dev.off()
# Functional annotation analysis with ChIPPeakAnno
library(ChIPpeakAnno)
peaks = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/diff/alone_vs_45.bed3", format="BED", header=FALSE)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/distribution.png', width=6, height=3, units="in", res=1200, pointsize = 1)
genomicElementDistribution(peaks, TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000))
dev.off()
library(EnsDb.Mmusculus.v79)
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene")
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/anno.png')
binOverFeature(peaks, annotationData=annoData,radius=5000, nbins=20, FUN=length, errFun=0, xlab="distance from TSS (bp)", ylab="count", main="Distribution of peaks around TSS")
dev.off()
library(org.Mm.eg.db)
peaks.anno <- annotatePeakInBatch(peaks, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500))
peaks.anno <- addGeneIDs(peaks.anno, "org.Mm.eg.db", IDs2Add = "entrez_id")
write.csv(as.data.frame(unname(peaks.anno)), "anno.csv")
over <- getEnrichedGO(peaks.anno, orgAnn="org.Mm.eg.db", condense=TRUE)
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/go.png', width=10, height=4, units="in", res=1200, pointsize = 1)
enrichmentPlot(over)
dev.off()
```
### Motif enrichment with background
```
# Motif enrichment analysis with background
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl alone_vs_45_top1000.bed3 mm10 alone_vs_45_top1000_motifs_vs_background -size 1000 -mask -bg alone_vs_45_background.bed3;
```
### Prepare background
```
# Prepare all peaks
T=$(mktemp)
bedtools multiinter -i *.narrowPeak | awk -v OFS='\t' '{print $1,$2,$3}' > $T;
bedtools merge -i $T > all_peaks.bed3
# Prepare background as (all_peaks - peaks) + peaks
bedtools subtract -A -a ../macs2_20mln/all_peaks.bed3 -b alone_vs_45.bed3 > $T;
bedtools multiinter -i $T alone_vs_45.bed3 | sort -k1,1 -k2,2n > $T.2;
bedtools merge -i $T.2 > alone_vs_45_background.bed3;
```
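The background is built as (all peaks minus differential peaks) re-united with the differential peaks and merged. The same construction in pybedtools, as a sketch assuming pybedtools is available and the BED files exist:
```python
# Sketch only: pybedtools version of the background construction above.
from pybedtools import BedTool

all_peaks = BedTool('all_peaks.bed3')
diff_peaks = BedTool('alone_vs_45.bed3')

# subtract(A=True) drops every all_peaks interval overlapping a differential peak;
# cat(..., postmerge=True) adds the differential peaks back and merges intervals.
background = all_peaks.subtract(diff_peaks, A=True).cat(diff_peaks, postmerge=True)
background.saveas('alone_vs_45_background.bed3')
```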
### Diffbind pairwise analysis
```
# R code
library(DiffBind)
# Use one of these ...
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_15/")
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_30/")
setwd("~/data/2022_Atac-Seq-2/diff_15_vs_45/")
samples = dba(sampleSheet = 'config.csv')
sample_counts = dba.count(samples)
peaksets <- dba.peakset(sample_counts, bRetrieve = TRUE)
png('counts.png')
plot(sample_counts)
dev.off()
sample_contrast = dba.contrast(sample_counts)
sample_contrast = dba.analyze(sample_contrast)
png('overlap.png')
overlap_rate = dba.overlap(sample_contrast, mode = DBA_OLAP_RATE)
plot(overlap_rate, type = "b", ylab = "# peaks", xlab = "Overlap at least this many peaksets")
dev.off()
png('pca.png')
dba.plotPCA(sample_contrast)
dev.off()
png('ma.png')
dba.plotMA(sample_contrast)
dev.off()
png('volcano.png')
dba.plotVolcano(sample_contrast)
dev.off()
sample_diff = dba.report(sample_contrast)
write.table(sample_diff, 'diff_alone_vs_15.csv', sep = ",", row.names = FALSE)
# BASH csv to bed3
cat diff_alone_vs_15.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_15.bed3
# BASH csv to bed3
cat diff_alone_vs_30.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_30.bed3
# BASH csv to bed3
cat diff_15_vs_45.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_15_vs_45.bed3
```
### Compute all differential peaks union
```
# Compute all differential peaks union
T=$(mktemp)
bedtools multiinter -i $(find . -name "diff*.bed3") | awk -v OFS='\t' '{print $1,$2,$3}' > $T
bedtools merge -i $T > diff_pairwise.bed3
# Create DiffBind config
cd /data/2022_Atac-Seq-2/diffbind
echo "SampleID,Tissue,Factor,Condition,Replicate,bamReads,ControlID,bamControl,Peaks,PeakCaller" > config.csv
for F in Mac-15min Mac-30min_AC Mac-45min_AC MacAlone; do for R in 1 2 3 4; do SAMPLE="$F-$R"; BAMREADS=$(find /data/2022_Atac-Seq-2/bams_20mln -name "$F*-${R}_20mln.bam"); echo "$SAMPLE,,Type,$F,$R,$BAMREADS,,,/data/2022_Atac-Seq-2/diff_pairwise.bed3,bed" >> config.csv; done; done
# R code for counts and genes annotating
library(DiffBind)
samples = dba(sampleSheet = 'config.csv')
sample_counts = dba.count(samples)
peaksets <- dba.peakset(sample_counts, bRetrieve = TRUE)
library(EnsDb.Mmusculus.v79)
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene")
library(org.Mm.eg.db)
peaksets <- annotatePeakInBatch(peaksets, AnnotationData=annoData) # Full annotation
peaksets <- addGeneIDs(peaksets, "org.Mm.eg.db", IDs2Add = "entrez_id")
write.csv(peaksets, 'diff_pairwise_counts.csv', row.names = FALSE)
# BASH
cat diff_pairwise_counts.csv | sed 's/,/\t/g' > diff_pairwise_counts.tsv
```
### Integrate with DE RNA-seq data
```
# Inspect number of DE genes with respect to change sign
cd de_filtered_tables
for F in *.csv; do
echo $F;
cat $F | grep -v log2 | wc -l;
cat $F | grep -v log2 | awk -v FS=',' '($3<0) {print $1}' | wc -l;
cat $F | grep -v log2 | awk -v FS=',' '($3>0) {print $1}' | wc -l;
done
```
### Prepare peaks for annotation with related genes
```
for F in diff_alone_vs_45/alone_vs_45.csv diff_alone_vs_15/alone_vs_15.csv diff_alone_vs_30/alone_vs_30.csv \
diff_15_vs_30/diff_15_vs_30.csv diff_15_vs_45/diff_15_vs_45.csv diff_30_vs_45/diff_30_vs_45.csv; do
echo $F
wc -l $F
cat $F | grep -v seqname | awk -v FS=',' -v OFS='\t' '($9<0) {print $1,$2,$3}' | sed 's#"##g' |\
grep -v -E 'chr[^\t]*_' | sort -k1,1 -k2,2n > ${F/.csv/_appear.bed3}
wc -l ${F/.csv/_appear.bed3}
cat $F | grep -v seqname | awk -v FS=',' -v OFS='\t' '($9>0) {print $1,$2,$3}' | sed 's#"##g' |\
grep -v -E 'chr[^\t]*_' | sort -k1,1 -k2,2n > ${F/.csv/_disappear.bed3}
wc -l ${F/.csv/_disappear.bed3}
done
```
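Column 9 of the DiffBind report is the fold change; the awk filters label negative values as "appear" and positive values as "disappear" peaks. A pandas sketch of the same split under that column layout (file names illustrative):
```python
# Sketch only: appear/disappear split of a DiffBind report.
import pandas as pd

report = pd.read_csv('diff_alone_vs_45/alone_vs_45.csv')
fold = report.iloc[:, 8]   # column 9: fold change
bed = report.iloc[:, :3]   # seqnames, start, end

def save_bed3(rows, path):
    rows = rows[~rows.iloc[:, 0].str.contains('_')]  # drop scaffold contigs, as grep -v does
    rows.sort_values(list(rows.columns)).to_csv(path, sep='\t', header=False, index=False)

save_bed3(bed[fold < 0], 'alone_vs_45_appear.bed3')
save_bed3(bed[fold > 0], 'alone_vs_45_disappear.bed3')
```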
Annotate DE ATAC-seq peaks with related genes
```R
library(ChIPpeakAnno)
library(org.Mm.eg.db)
library(EnsDb.Mmusculus.v79)
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene")
FILES = c(
"/data/2022_Atac-Seq-2/macs2_20mln/all_peaks.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_45/alone_vs_45_appear.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_45/alone_vs_45_disappear.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_15/alone_vs_15_appear.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_15/alone_vs_15_disappear.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_30/alone_vs_30_appear.bed3",
"/data/2022_Atac-Seq-2/diff_alone_vs_30/alone_vs_30_disappear.bed3",
"/data/2022_Atac-Seq-2/diff_15_vs_30/diff_15_vs_30_appear.bed3",
"/data/2022_Atac-Seq-2/diff_15_vs_30/diff_15_vs_30_disappear.bed3",
"/data/2022_Atac-Seq-2/diff_15_vs_45/diff_15_vs_45_appear.bed3",
"/data/2022_Atac-Seq-2/diff_15_vs_45/diff_15_vs_45_disappear.bed3",
"/data/2022_Atac-Seq-2/diff_30_vs_45/diff_30_vs_45_appear.bed3",
"/data/2022_Atac-Seq-2/diff_30_vs_45/diff_30_vs_45_disappear.bed3"
)
for (file in FILES) {
print(file)
peaks = toGRanges(file, format="BED", header=FALSE)
peaks.anno <- annotatePeakInBatch(peaks, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500))
peaks.anno <- addGeneIDs(peaks.anno, "org.Mm.eg.db", IDs2Add = "entrez_id")
write.csv(as.data.frame(unname(peaks.anno)), paste(file, ".csv", sep=""))
}
```
Overlap genes from ATAC-seq DE and RNA-seq DE.\
Check the DE results at each timepoint against the ATAC-seq peak sets.
```bash
mkdir -p rna-atac
# Filter RNA-seq DE genes versus ATAC-seq files
for DE in de_filtered_tables/45_vs_alone_filter_table.csv de_filtered_tables/90_vs_alone_filter_table.csv de_filtered_tables/180_vs_alone_filter_table.csv; do
for P in all_peaks.bed3.csv \
diff_alone_vs_45/alone_vs_45_appear.bed3.csv diff_alone_vs_45/alone_vs_45_disappear.bed3.csv \
diff_alone_vs_30/alone_vs_30_appear.bed3.csv diff_alone_vs_30/alone_vs_30_disappear.bed3.csv \
diff_alone_vs_15/alone_vs_15_appear.bed3.csv diff_alone_vs_15/alone_vs_15_disappear.bed3.csv \
diff_15_vs_30/diff_15_vs_30_appear.bed3.csv diff_15_vs_30/diff_15_vs_30_disappear.bed3.csv \
diff_15_vs_45/diff_15_vs_45_appear.bed3.csv diff_15_vs_45/diff_15_vs_45_disappear.bed3.csv \
diff_30_vs_45/diff_30_vs_45_appear.bed3.csv diff_30_vs_45/diff_30_vs_45_disappear.bed3.csv; do
echo "Differential expression $DE $(cat $DE | wc -l) vs peaks $P $(cat $P | wc -l)";
DEF=$(basename $DE | sed -E 's/\..*//');
PF=$(basename $P | sed -E 's/\..*//');
echo "Diff $DE < 0: $(cat $DE | grep -v log2 | awk -v FS=',' '($3<0) {print $8}' | wc -l) vs $P";
touch rna-atac/rna_${DEF}_gt0_vs_atac_${PF}.csv;
cat $DE | grep -v log2 | awk -v FS=',' '($3<0) {print $8}' | sed -E 's/\..*//' | while read -r ENS; do
grep $ENS $P; done > rna-atac/rna_${DEF}_gt0_vs_atac_${PF}.csv;
echo "Diff $DE > 0: $(cat $DE | grep -v log2 | awk -v FS=',' '($3>0) {print $8}' | wc -l) vs $P";
touch rna-atac/rna_${DEF}_ls0_vs_atac_${PF}.csv;
cat $DE | grep -v log2 | awk -v FS=',' '($3>0) {print $8}' | sed -E 's/\..*//' | while read -r ENS; do
grep $ENS $P; done > rna-atac/rna_${DEF}_ls0_vs_atac_${PF}.csv;
done;
done;
# Produce BED and TXT files with gene names and Ensembl gene IDs
cd rna-atac
for F in *.csv; do
echo $F;
cat $F | awk -v FS=',' -v OFS='\t' '{print $2,$3,$4,$16,$12,$14,$15}' |\
sed 's/"//g' | sort -k1,1 -k2,2n > ${F/.csv/.bed};
cat $F | awk -v FS=',' '{print $16}' | sed 's/"//g' | sort --unique > ${F/.csv/_gene.txt};
cat $F | awk -v FS=',' '{print $8}' | sed 's/"//g' | sort --unique > ${F/.csv/_ens.txt};
wc -l ${F/.csv/_ens.txt};
done
```
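The grep-per-gene loop above rescans the peak table once per DE gene; a pandas sketch of the same gene-level intersection, assuming the column positions used by the awk commands (log2 fold change in column 3 and Ensembl gene ID in column 8 of both tables; file names illustrative):
```python
# Sketch only: RNA-seq DE genes intersected with annotated ATAC-seq peaks.
import pandas as pd

def strip_version(col):
    # drop the Ensembl version suffix, as `sed -E 's/\..*//'` does above
    return col.astype(str).str.replace(r'\..*', '', regex=True)

de = pd.read_csv('de_filtered_tables/45_vs_alone_filter_table.csv')
peaks = pd.read_csv('all_peaks.bed3.csv')

de_ids = strip_version(de.iloc[:, 7])      # column 8: Ensembl gene ID
peak_ids = strip_version(peaks.iloc[:, 7])

down = set(de_ids[de.iloc[:, 2] < 0])      # column 3: log2 fold change < 0
peaks[peak_ids.isin(down)].to_csv('rna_down_vs_atac_all_peaks.csv', index=False)
```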
Save positions of all ATAC-seq peaks associated with genes
```
cat all_peaks.bed3.csv | grep -v seqnames | awk -v FS=',' -v OFS='\t' '{print $2,$3,$4,$8,$12,$14,$15}' |\
sed 's/"//g' | sort -k1,1 -k2,2n > all_peaks.bed
cat all_peaks.bed3.csv | grep -v seqnames | awk -v FS=',' -v OFS='\t' '{print $8}' | sed 's/"//g' |\
sort --unique > all_peaks.txt
```
Analyze clusters of ATAC-seq data: launch the `analysis.ipynb` Jupyter notebook, then run HOMER motif enrichment on the differential peaks and on each cluster BED file.
```
cd ~/data/2022_Atac-Seq-2
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl diff_pairwise.bed3 mm10 diff_pairwise -mask
cd clusters
for F in *.bed3; do
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl $F mm10 ${F/.bed3/} -mask
done
```