Last active
November 17, 2024 15:11
-
-
Save chillenb/b679d8dface65504af9aa181e971ca58 to your computer and use it in GitHub Desktop.
PySCF helper function benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "77f8f053-c2d8-4d28-aa7b-965830c7f43c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "from pyscf import lib, gto, scf, ao2mo\n", | |
| "import scipy.linalg.blas as blas" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "a652626b-3b7d-454a-ab57-cd33c69c7dae", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[{'numpy_version': '2.1.2',\n", | |
| " 'python': '3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) '\n", | |
| " '[GCC 12.3.0]',\n", | |
| " 'uname': uname_result(system='Linux', node='pizza', release='6.10.10-arch1-1-custom', version='#1 SMP Tue, 22 Oct 2024 13:11:16 +0000', machine='x86_64')},\n", | |
| " {'simd_extensions': {'baseline': ['SSE', 'SSE2', 'SSE3'],\n", | |
| " 'found': ['SSSE3',\n", | |
| " 'SSE41',\n", | |
| " 'POPCNT',\n", | |
| " 'SSE42',\n", | |
| " 'AVX',\n", | |
| " 'F16C',\n", | |
| " 'FMA3',\n", | |
| " 'AVX2',\n", | |
| " 'AVX512F',\n", | |
| " 'AVX512CD',\n", | |
| " 'AVX512_SKX',\n", | |
| " 'AVX512_CLX'],\n", | |
| " 'not_found': ['AVX512_KNL',\n", | |
| " 'AVX512_KNM',\n", | |
| " 'AVX512_CNL',\n", | |
| " 'AVX512_ICL',\n", | |
| " 'AVX512_SPR']}},\n", | |
| " {'filepath': '/home/chillenb/miniforge3/envs/fcdmftenv/lib/libmkl_rt.so.2',\n", | |
| " 'internal_api': 'mkl',\n", | |
| " 'num_threads': 48,\n", | |
| " 'prefix': 'libmkl_rt',\n", | |
| " 'threading_layer': 'intel',\n", | |
| " 'user_api': 'blas',\n", | |
| " 'version': '2024.2.2-Product'},\n", | |
| " {'filepath': '/home/chillenb/miniforge3/envs/fcdmftenv/lib/libomp.so',\n", | |
| " 'internal_api': 'openmp',\n", | |
| " 'num_threads': 48,\n", | |
| " 'prefix': 'libomp',\n", | |
| " 'user_api': 'openmp',\n", | |
| " 'version': None}]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "np.show_runtime()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "3b98cbba-bba3-48b5-977a-cde9e0f51414", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "A = np.random.random((10000,10000))\n", | |
| "B = np.random.random((10000,10000))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "5f585e90-da0e-4886-ba4b-be0bf0d96d17", | |
| "metadata": {}, | |
| "source": [ | |
| "## Some illustrative benchmarks (48 core cascade lake)\n", | |
| "\n", | |
| "### 10000 x 10000 matmul\n", | |
| "takes 1.32 s" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "de0fe9be-3152-4d7c-91c2-ef19cfc1ec7f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1.32 s ± 1.29 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "C = np.zeros_like(A)\n", | |
| "%timeit -n 2 -r 2 np.matmul(A, B, out=C)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "af92f5d8-0402-48d5-a0a9-9bd944e4c090", | |
| "metadata": {}, | |
| "source": [ | |
| "### numpy elementwise multiplication takes ~300 ms!\n", | |
| "it's single threaded and cannot saturate the whole node's memory bandwidth" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "ab5b6e4c-c67f-4cbe-9810-9ae111282b85", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "294 ms ± 583 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "C = np.zeros_like(A)\n", | |
| "%timeit np.multiply(A, B, out=C)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "bc6eef4f-1a32-4088-90b1-e9214b0f33ed", | |
| "metadata": {}, | |
| "source": [ | |
| "### Multithreaded elementwise multiplication is 5x as fast" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "6589a04c-95fa-4ea5-9e1d-4b1e91f29090", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def mulcopy(A, B, out=None):\n", | |
| " return lib.entrywise_mul(A, B, out=out)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "4029cfc2-bf85-4f8b-8e92-7bb57d32c0cc", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "50.4 ms ± 500 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "C = np.zeros_like(A)\n", | |
| "%timeit mulcopy(A, B, out=C)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "id": "e98bdb03-d35d-41c2-b80c-160db4333b8e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "Acopy = lib.omatcopy(A)\n", | |
| "Bcopy = lib.omatcopy(B)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "429c5f6e-51e9-4ddd-9d17-e1b48d43708a", | |
| "metadata": {}, | |
| "source": [ | |
| "### If you `omatcopy` everything and don't preallocate C, parallel entrywise multiplication is about 10x faster" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 55, | |
| "id": "f6e881b6-ad6e-4b4d-8b9c-997783b4ffe6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "31.6 ms ± 4.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%timeit mulcopy(Acopy, Bcopy)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "12fb1600-0d0b-495e-9aa8-8570242940cf", | |
| "metadata": {}, | |
| "source": [ | |
| "## Somehow, matrix multiplication is also faster now, even though it is compute-bound" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "0165a874-32d5-415b-b8e9-cb41cde9d127", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "847 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "C = np.empty_like(Acopy)\n", | |
| "%timeit np.matmul(Acopy, Bcopy, out=C)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7401356a-f5f4-46b3-9def-91736e1b2de3", | |
| "metadata": {}, | |
| "source": [ | |
| "## Reason:\n", | |
| "### **`omatcopy` (multithreaded matrix copy) spreads out A, B and C over both NUMA nodes**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b13fbd1e-abbe-4b96-be52-904568e0006d", | |
| "metadata": {}, | |
| "source": [ | |
| "# Application: Speeding up a _GW_ calculation\n", | |
| "\n", | |
| "The following code is from `pyscf/gw/gw_ac.py`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "cd4dae9b-8c28-46e5-87a4-e1cfb52de398", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def get_rho_response(omega, mo_energy, Lpq):\n", | |
| " '''\n", | |
| " Compute density response function in auxiliary basis at freq iw\n", | |
| " '''\n", | |
| " naux, nocc, nvir = Lpq.shape\n", | |
| " eia = mo_energy[:nocc,None] - mo_energy[None,nocc:]\n", | |
| " eia = eia/(omega**2+eia*eia)\n", | |
| " Pia = lib.einsum('Pia,ia->Pia',Lpq,eia)\n", | |
| " # Response from both spin-up and spin-down density\n", | |
| " Pi = 4. * lib.einsum('Pia,Qia->PQ',Pia,Lpq)\n", | |
| "\n", | |
| " return Pi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4c2497e9-a4ae-4e62-b57b-82dd81e66462", | |
| "metadata": {}, | |
| "source": [ | |
| "Set up an example calculation." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "d78c512d-7b8a-4ff8-a39d-1b5508230472", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "converged SCF energy = -383.474770909024\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "np.float64(-383.47477090902385)" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "naphthalene = '''\n", | |
| " C 2.4044 0.7559 0.0000\n", | |
| " C 2.4328 -0.6584 0.0000\n", | |
| " C 1.2672 -1.3753 0.0000\n", | |
| " C 0.0142 -0.7050 0.0000\n", | |
| " C -0.0142 0.7048 0.0000\n", | |
| " C 1.2108 1.4252 0.0000\n", | |
| " C -1.2672 1.3754 0.0000\n", | |
| " C -2.4328 0.6585 0.0000\n", | |
| " C -2.4043 -0.7558 0.0000\n", | |
| " C -1.2108 -1.4254 0.0000\n", | |
| " H 3.3509 1.3062 0.0000\n", | |
| " H 3.4006 -1.1703 0.0000\n", | |
| " H 1.2810 -2.4710 0.0000\n", | |
| " H 1.1803 2.5206 0.0000\n", | |
| " H -1.2808 2.4710 0.0000\n", | |
| " H -3.4008 1.1701 0.0000\n", | |
| " H -3.3508 -1.3060 0.0000\n", | |
| " H -1.1805 -2.5207 0.0000\n", | |
| "'''\n", | |
| "mol = gto.M(atom=naphthalene, basis='cc-pvtz', max_memory=30000)\n", | |
| "mf = scf.RHF(mol).density_fit(auxbasis='cc-pvtz-ri')\n", | |
| "mf.kernel()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "67ebbe08-8bdc-434c-b046-2a37de0cf7f2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "cderi = mf.with_df._cderi\n", | |
| "nmo = mol.nao\n", | |
| "Lpq = ao2mo._ao2mo.nr_e2(cderi, mf.mo_coeff, (0, nmo, 0, nmo), aosym='s2', mosym='s1')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "id": "6b1a7c94-59eb-4379-896e-4b1e48049fc2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "naux, _ = Lpq.shape\n", | |
| "Lpq = Lpq.reshape(naux, nmo, nmo)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "id": "a779a4af-7f1d-4dd7-ba52-042089e111be", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(1050, 412, 412)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(Lpq.shape)\n", | |
| "nocc = mol.nelectron // 2\n", | |
| "nvir = nmo - nocc\n", | |
| "Lia = np.ascontiguousarray(Lpq[:, :nocc, nocc:])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "770b58ad-f3ca-4fb6-a1c6-7e3aa02e036e", | |
| "metadata": {}, | |
| "source": [ | |
| "## Baseline: the current `get_rho_response` function takes ~330 ms." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "id": "0c6ac088-6dff-4ee9-a1a3-90164446ad58", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "334 ms ± 975 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%timeit -n 10 get_rho_response(1.0, mf.mo_energy, Lia)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "id": "42a17863-5154-436e-bfb7-40f6f312dcaa", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "eia = mf.mo_energy[:nocc,None] - mf.mo_energy[None,nocc:]\n", | |
| "eia = eia/(1.0**2+eia*eia)\n", | |
| "eia = eia.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4d6f3caa-70f3-44f6-8f8f-c29e96313c96", | |
| "metadata": {}, | |
| "source": [ | |
| "## Of the 330 ms total time, 45 ms is entrywise multiplication" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "id": "907a9291-c268-4e60-87fd-aec4b6965d94", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "47.4 ms ± 98.6 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%timeit Pia = lib.einsum('Pia,ia->Pia',Lia,eia)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "id": "61622135-28f9-43dd-bf3f-30a12a58f1d4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import ctypes\n", | |
| "def faster_scale(eia, Lia):\n", | |
| " fn = lib.numpy_helper._np_helper.NPomp_dmul\n", | |
| " naux, nocc, nvir = Lia.shape\n", | |
| " Pia = np.empty((naux, nocc*nvir))\n", | |
| " eia_ravel = eia.reshape(-1)\n", | |
| " # broadcasting is accomplished\n", | |
| " # by setting eia_ravel's leading dimension to 0\n", | |
| " fn(ctypes.c_size_t(naux),\n", | |
| " ctypes.c_size_t(nocc*nvir),\n", | |
| " eia_ravel.ctypes.data_as(ctypes.c_void_p),\n", | |
| " ctypes.c_size_t(0),\n", | |
| " Lia.ctypes.data_as(ctypes.c_void_p),\n", | |
| " ctypes.c_size_t(nocc*nvir),\n", | |
| " Pia.ctypes.data_as(ctypes.c_void_p),\n", | |
| " ctypes.c_size_t(nocc*nvir))\n", | |
| " return Pia.reshape(naux, nocc, nvir)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "1ddca3a1-f642-4308-93b2-bd976aad3f33", | |
| "metadata": {}, | |
| "source": [ | |
| "## With multithreading, only 5 ms is required for the multiplication step" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "id": "088dd8b5-dad3-446a-9c44-599e1fbc2dd4", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "5.09 ms ± 136 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%timeit pia = faster_scale(eia, Lia)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "e1da6463-12d0-4359-90ed-3a24f2b074a4", | |
| "metadata": {}, | |
| "source": [ | |
| "## Let's replace the einsum with dgemm and use the fast entrywise multiplication routine from above:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "id": "e84ecc3a-8aa5-44f3-82c2-7d952962038d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def get_rho_response_fast(omega, mo_energy, Lia):\n", | |
| " '''\n", | |
| " Compute density response function in auxiliary basis at freq iw\n", | |
| " '''\n", | |
| " naux, nocc, nvir = Lia.shape\n", | |
| " eia = mo_energy[:nocc,None] - mo_energy[None,nocc:]\n", | |
| " eia = eia/(omega**2+eia*eia)\n", | |
| " Pia = faster_scale(eia, Lia)\n", | |
| " # Response from both spin-up and spin-down density\n", | |
| " Pi = blas.dgemm(4.0, Lia.reshape(naux, -1).T, Pia.reshape(naux, -1).T, trans_a=1, trans_b=0).T\n", | |
| " return Pi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ef6320e8-7d8c-48fb-a42a-ab34e457044d", | |
| "metadata": {}, | |
| "source": [ | |
| "### Correctness test" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "id": "5eeb6b44-d66f-455f-9d07-946ec8736944", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "np.float64(9.617575416258839e-16)" | |
| ] | |
| }, | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "np.linalg.norm(get_rho_response(1.0, mf.mo_energy, Lia) - get_rho_response_fast(1.0, mf.mo_energy, Lia))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ce7e4a94-7342-4128-b39f-491a409fb66a", | |
| "metadata": {}, | |
| "source": [ | |
| "## Benchmark\n", | |
| "### Time is reduced from 329 ms to 26.1 ms (!!)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "id": "e714dca4-21f7-450f-b953-87f3b19fdfbd", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "26.1 ms ± 5.08 ms per loop (mean ± std. dev. of 7 runs, 30 loops each)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%timeit -n 30 get_rho_response_fast(1.0, mf.mo_energy, Lia)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "4bbec0f4-ab7c-4f0a-970a-0f831a4a16d9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment