Last active
March 9, 2024 17:06
-
-
Save mrocklin/f7c1eeb3895a6798b233cd0e3de335ff to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "36e7f837-d110-4c3e-b0f8-ee5c0bae6cc4", | |
| "metadata": {}, | |
| "source": [ | |
| "# arXiv + RAG\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3c66080b-fa4f-4a7c-9671-459bb30dfc23", | |
| "metadata": {}, | |
| "source": [ | |
| "## Extract documents" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "be8308a8-48ce-46b4-a6d2-9c7f445f8ae8", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['arxiv/pdf/arXiv_pdf_0001_001.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0001_002.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0002_001.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0002_002.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0003_001.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0003_002.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0004_001.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0004_002.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0005_001.tar',\n", | |
| " 'arxiv/pdf/arXiv_pdf_0005_002.tar']" | |
| ] | |
| }, | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import s3fs\n", | |
| "s3 = s3fs.S3FileSystem(requester_pays=True)\n", | |
| "\n", | |
| "directories = s3.ls(\"s3://arxiv/pdf\")\n", | |
| "directories[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "75229f27-3e76-4954-a2ac-816482d3e29a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import tarfile\n", | |
| "import io\n", | |
| "import fitz\n", | |
| "\n", | |
| "def extract(filename: str):\n", | |
| " \"\"\" Extract and process one directory of arXiv data\n", | |
| " \n", | |
| " Yields\n", | |
| " ------\n", | |
| " dictionary with name and text pairs\n", | |
| " \"\"\"\n", | |
| " with s3.open(filename) as f:\n", | |
| " bytes = f.read()\n", | |
| " with io.BytesIO() as bio:\n", | |
| " bio.write(bytes)\n", | |
| " bio.seek(0)\n", | |
| " with tarfile.TarFile(fileobj=bio) as tf:\n", | |
| " for member in tf.getmembers():\n", | |
| " if member.isfile() and member.name.endswith(\".pdf\"):\n", | |
| " data = tf.extractfile(member).read()\n", | |
| " if not data:\n", | |
| " print(\"empty document\", member.name)\n", | |
| " continue\n", | |
| " \n", | |
| " with fitz.Document(\n", | |
| " stream=tf.extractfile(member).read()\n", | |
| " ) as pdf:\n", | |
| " # TODO: think about smaller chunks / overlapping, etc..\n", | |
| " for page in pdf.pages(): \n", | |
| " yield {\n", | |
| " \"name\": member.name,\n", | |
| " \"text\": page.get_text(),\n", | |
| " }" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "d51170fb-6c07-4996-83ab-1b5594d689cd", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1min 9s, sys: 14.2 s, total: 1min 23s\n", | |
| "Wall time: 2min 14s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "out = extract(directories[-9])\n", | |
| "out = list(out)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "df1cb7da-0f0d-4391-ba18-c565e3e01f02", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 57.7 ms, sys: 23.8 ms, total: 81.5 ms\n", | |
| "Wall time: 82.8 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>text</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>9909/astro-ph9909120.pdf</td>\n", | |
| " <td>arXiv:astro-ph/9909120v1 7 Sep 1999\n", | |
| "Mon. Not....</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>9909/astro-ph9909120.pdf</td>\n", | |
| " <td>2\n", | |
| "S. Boissier and N. Prantzos\n", | |
| "This latter poin...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>9909/astro-ph9909120.pdf</td>\n", | |
| " <td>Chemo-spectrophotometric evolution of spiral g...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>9909/astro-ph9909120.pdf</td>\n", | |
| " <td>4\n", | |
| "S. Boissier and N. Prantzos\n", | |
| "Figure 1.\n", | |
| "Main p...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>9909/astro-ph9909120.pdf</td>\n", | |
| " <td>Chemo-spectrophotometric evolution of spiral g...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " name text\n", | |
| "0 9909/astro-ph9909120.pdf arXiv:astro-ph/9909120v1 7 Sep 1999\n", | |
| "Mon. Not....\n", | |
| "1 9909/astro-ph9909120.pdf 2\n", | |
| "S. Boissier and N. Prantzos\n", | |
| "This latter poin...\n", | |
| "2 9909/astro-ph9909120.pdf Chemo-spectrophotometric evolution of spiral g...\n", | |
| "3 9909/astro-ph9909120.pdf 4\n", | |
| "S. Boissier and N. Prantzos\n", | |
| "Figure 1.\n", | |
| "Main p...\n", | |
| "4 9909/astro-ph9909120.pdf Chemo-spectrophotometric evolution of spiral g..." | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "df = pd.DataFrame(out, dtype=\"string[pyarrow]\")\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "0c2651e1-99ef-439a-9ffd-42ee86421f38", | |
| "metadata": {}, | |
| "source": [ | |
| "## Perform embeddings" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "c2e78c56-7537-427d-a200-44a6d2fb95e2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/Users/mrocklin/mambaforge/envs/rag/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", | |
| " return self.fget.__get__(instance, owner)()\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1.27 s, sys: 526 ms, total: 1.8 s\n", | |
| "Wall time: 3.66 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "from sentence_transformers import SentenceTransformer\n", | |
| "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "58836576-2c01-4189-99bd-74d905a82ff5", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 54.8 s, sys: 5.93 s, total: 1min\n", | |
| "Wall time: 2min 12s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "embeddings = model.encode(df.text)\n", | |
| "df[\"vector\"] = list(embeddings)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "8a30d9b2-d953-4d7b-96b3-ff92d9cedf60", | |
| "metadata": {}, | |
| "source": [ | |
| "## Create Vector Database locally with LanceDB" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "0e075947-c6eb-4521-a651-bb49c8082e4d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os,shutil\n", | |
| "if os.path.exists(\"local.db\"):\n", | |
| " shutil.rmtree(\"local.db\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "d9b1f3f1-d6d2-42a0-a4d0-c81de073f086", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import lancedb\n", | |
| "db = lancedb.connect(\n", | |
| " \"local.db\", \n", | |
| ")\n", | |
| "tbl = db.create_table(\"arxiv\", data=df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "b4ad4e2a-0c2a-4013-9524-1176f2e09722", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "arXiv:astro-ph/9909498v1 29 Sep 1999\n", | |
| "INNER REGION ACCRETION FLOWS ONTO BLACK HOLES\n", | |
| "MENAS KAFATOS AND PRASAD SUBRAMANIAN\n", | |
| "Center for Earth Observing and Space Research\n", | |
| "George Mason University\n", | |
| "Fairfax, VA 22030, U.S.A.\n", | |
| "1.\n", | |
| "Abstract\n", | |
| "We examine here the inner region accretion flows onto black holes. A variety\n", | |
| "of models are presented. We also discuss viscosity mechanisms under a\n", | |
| "variety of circumstances, for standard accretion disks onto galactic black\n", | |
| "holes and supermassive black holes and hot accretion disks. Relevant work\n", | |
| "is presented here on unified aspects of disk accretion onto supermassive\n", | |
| "black holes and the possible coupling of thick disks to beams in the inner\n", | |
| "regions. We also explore other accretion flow scenarios. We conclude that\n", | |
| "a variety of scenarios yield high temperatures in the inner flows and that\n", | |
| "viscosity is likely not higher than alpha ∼ 0.01.\n", | |
| "2.\n", | |
| "Introduction\n", | |
| "“Accretion is recognized as a phenomenon of fundamental importance in\n", | |
| "Astrophysics” (Frank, King and Raine, 1992). This is indeed the case as\n", | |
| "gravitational energy released in accretion processes is believed to be the\n", | |
| "dominant source of energy in a variety of high energy galactic compact\n", | |
| "sources in binary star systems containing white dwarfs, neutron stars and\n", | |
| "black holes; as well as extragalactic, supermassive black holes (Shapiro and\n", | |
| "Teukolsky, 1983). Both spherical/quasi-spherical accretion (Bondi, 1952;\n", | |
| "Frank et al., 1992 and references therein; Treves, Maraschi and Abramow-\n", | |
| "icz, 1989 and papers therein); and disk accretion (Pringle, 1981; Dermott,\n", | |
| "Hunter, and Wilson, 1992; Frank et al., 1992 and references therein; Treves\n", | |
| "et al., 1989 and papers therein) operate and the type of accretion may\n", | |
| "depend on boundary conditions such as the motion of gas at infinity, its\n", | |
| "angular momentum per unit mass, etc. The field of accretion astrophysics is\n", | |
| "obviously vast. In this paper we concentrate on the inner regions of accretion\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n", | |
| "its constituent masses; so energy conservation is not sufficient to stabilize the black\n", | |
| "holes against spontaneous fragmentation.\n", | |
| "The black hole is thus stable in the extremal limit; so it is reasonable to describe\n", | |
| "it as the ground state in a conformal field theory. Moving away from extremality, the\n", | |
| "black holes exhibit thermal properties, as expected. These are interpreted in terms of\n", | |
| "perturbations of the conformal field theory and should remain under control, as long\n", | |
| "as they are small. The prospects of a precise microscopic description of the black\n", | |
| "holes, even though they are not supersymmetric, is the ultimate goal of the inquiry.\n", | |
| "However, this will be pursued elsewhere; the discussion in this paper focusses on the\n", | |
| "classical properties of the black holes.\n", | |
| "The present investigation is motivated by several relations to string theory. For ex-\n", | |
| "ample, add six additional toroidally compactified dimensions, and interpret the com-\n", | |
| "pact Kaluza-Klein direction as the M-theory circle. Then the solutions are charged\n", | |
| "with respect to the “electric” charge of D0-branes and the “magnetic” charge of D6-\n", | |
| "branes, fully wrapping the six inert dimensions. Thus the solutions can be interpreted\n", | |
| "as rotating bound states of D0- and D6-branes. As discussed above, angular momen-\n", | |
| "tum conservation implies that such bound states are stable, even though they are not\n", | |
| "supersymmetric. The argument may explain the stability to the leading order noted\n", | |
| "in the string theory description of the D0 − D6 system [1].\n", | |
| "For another application, recall that general four dimensional black holes in N =\n", | |
| "4, 8 string theory are generated by black holes depending on 5 parametric charges [2].\n", | |
| "By now it is standard to consider four of these charges [3, 4], but the fifth charge is\n", | |
| "difficult and our understanding is incomplete, even at the level of classical solutions;\n", | |
| "for discussion see e.g [5]. The problematic fifth charge parametrizes the inner product\n", | |
| "of the electric and magnetic charge vectors; when it is absent, the four charges refer\n", | |
| "to independent U(1)’s, up to duality. In the present work there is only one U(1) field,\n", | |
| "so electric and magnetic charges are necessarilly parallel. In a sense, there is only the\n", | |
| "fifth charge, and therefore good opportunity to study its properties.\n", | |
| "The paper is organized as follows. In section 2, we review the solution generating\n", | |
| "technique, following Sen [6]. The resulting solution is presented in its five-dimensional\n", | |
| "form, along with necessary notation. There is also a discussion of various special\n", | |
| "cases, and the relation to previously known solutions. Subsequently, in section 3,\n", | |
| "2\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n", | |
| "the initial state is characterized by excitations with size of order the thermal wavelength,\n", | |
| "so causality suggests that the relaxation timescale should also be of order the thermal\n", | |
| "wavelength.\n", | |
| "The results we obtain are consistent with this expectation. A black hole in AdS is\n", | |
| "determined by two dimensionful parameters, the AdS radius R and the black hole radius\n", | |
| "r+. The quasinormal frequencies must be functions of these parameters. For large black\n", | |
| "holes, r+ ≫ R, we will show that there is an additional symmetry which insures that\n", | |
| "the frequencies can depend only on the black hole temperature T ∼ r+/R2. However,\n", | |
| "for smaller black holes, this is no longer the case. Whereas the temperature begins to\n", | |
| "increase as one decreases r+ below R, we find that the (imaginary part of the) frequency\n", | |
| "continues to decrease with r+. This is different from what happens for asymptotically flat\n", | |
| "black holes. An ordinary Schwarzschild black hole has only one dimensionful parameter\n", | |
| "which can be taken to be the temperature. Its quasinormal frequencies must therefore be\n", | |
| "multiples of this temperature. Thus small black holes in AdS do NOT behave like black\n", | |
| "holes in asymptotically flat spacetime. The reason is simply that the boundary conditions\n", | |
| "at infinity are changed. More physically, the late time behavior of the field is affected by\n", | |
| "waves bouncing off the potential at large r.\n", | |
| "Another difference from the asymptotically flat case concerns the decay at very late\n", | |
| "times. For a Schwarzschild black hole, it is known that the exponential decay associated\n", | |
| "with the quasinormal modes eventually gives way to a power law tail [11]. This has been\n", | |
| "shown to be associated with the scattering of the field off the Coulomb potential at large\n", | |
| "r. As we will discuss later, for asymptotically AdS black holes, this does not occur.\n", | |
| "We will compute the quasinormal frequencies for Schwarzschild-AdS black holes in\n", | |
| "the dimensions of interest for the AdS/CFT correspondence: four, five, and seven. We will\n", | |
| "consider minimally coupled scalar perturbations representing, e.g., the dilaton. This cor-\n", | |
| "responds to a particular perturbation of the CFT. For example, for AdS5, it corresponds\n", | |
| "to a perturbation of an (approximately) thermal state in super Yang-Mills on S3 × R with\n", | |
| "< F 2 > nonzero. In the linearized approximation we are using, the spacetime metric is\n", | |
| "not affected by the scalar field. So the perturbation of the thermal state does not change\n", | |
| "the energy density, which remains uniform over the sphere. The late time decay of this\n", | |
| "perturbation is universal in the sense that all solutions for the dilaton with the same angu-\n", | |
| "lar dependence will decay at the same rate, which is determined by the imaginary part of\n", | |
| "the lowest quasinormal frequency. Different perturbations, corresponding to different lin-\n", | |
| "2\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "results = tbl.search(model.encode(\"black holes are cold\")).limit(3).to_pandas().text.tolist()\n", | |
| "for result in results:\n", | |
| " print(result)\n", | |
| " print(\"-\" * 80)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "c72120d2-c1e7-43b2-9793-25b8b61b55d6", | |
| "metadata": {}, | |
| "source": [ | |
| "## Create Vector Database in S3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "24e2dbac-cb6a-46f3-8b30-98af79260050", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[]" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import s3fs\n", | |
| "\n", | |
| "s3 = s3fs.S3FileSystem()\n", | |
| "s3.rm(\"s3://openscapes-scratch/mrocklin/lance\", recursive=True)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "b166f8a8-ea3c-43b5-bf02-b58bc70e2d39", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Workaround: the AWS region currently has to be set in the environment too,\n", | |
| "# even though it is also passed to lancedb.connect below.\n", | |
| "\n", | |
| "import os\n", | |
| "os.environ[\"AWS_DEFAULT_REGION\"] = \"us-west-2\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "b4e8754f-2efb-4926-8541-27cea36dd307", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[2024-03-09T17:05:18Z WARN lance_table::io::commit] Using unsafe commit handler. Concurrent writes may result in data loss. Consider providing a commit handler that prevents conflicting writes.\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1.65 s, sys: 507 ms, total: 2.15 s\n", | |
| "Wall time: 27.4 s\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "\n", | |
| "import lancedb\n", | |
| "db = lancedb.connect(\n", | |
| " \"s3://openscapes-scratch/mrocklin/lance\", \n", | |
| " region=\"us-west-2\",\n", | |
| ")\n", | |
| "tbl = db.create_table(\"arxiv\", data=df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "e85eca47-3b4d-4846-9aff-b3fe1de7f0e7", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "arXiv:astro-ph/9909087v1 4 Sep 1999\n", | |
| "Mon. Not. R. Astron. Soc. 000, 000–000 (0000)\n", | |
| "Printed 12 November 2018\n", | |
| "(MN LATEX style file v1.4)\n", | |
| "Constraints on structure formation models from the\n", | |
| "Sunyaev-Zel′dovich Effect\n", | |
| "Subhabrata Majumdar⋆1,2, Ravi Subrahmanyan†3,4\n", | |
| "1 Joint Astrophysics Programme, Department of Physics, Indian Institute of Science, Bangalore 560012, India\n", | |
| "2 Indian Institute of Astrophysics, Koramangala, Bangalore 560034, India\n", | |
| "3 Australia Telescope National Facility, CSIRO, Locked bag 194, Narrabri, NSW 2390, Australia\n", | |
| "4 Raman Research Institute, Sadashivanagar, Bangalore 560080, India\n", | |
| "12 November 2018\n", | |
| "ABSTRACT\n", | |
| "In the context of cold dark matter (CDM) cosmological models, we have simulated im-\n", | |
| "ages of the brightness temperature fluctuations in the cosmic microwave background\n", | |
| "(CMB) sky owing to the Sunyaev Zel′dovich (S-Z) effect in a cosmological distribution\n", | |
| "of clusters. We compare the image statistics with recent ATCA limits on arcmin-scale\n", | |
| "CMB anisotropy. The S-Z effect produces a generically non-Gaussian field and we\n", | |
| "compute the variance in the simulated temperature-anisotropy images, after convolu-\n", | |
| "tion with the ATCA beam pattern, for different cosmological models. All the models\n", | |
| "are normalised to the 4-year COBE data. We find an increase in the simulated-sky\n", | |
| "temperature variance with increase in the cosmological density parameter Ω0. A com-\n", | |
| "parison with the upper limits on the sky variance set by the ATCA appears to rule\n", | |
| "out our closed-universe model: low-Ω0 open-universe models are preferred. The result\n", | |
| "is independent of any present day observations of σ8.\n", | |
| "Key words: cosmic microwave background — galaxies: clusters — cosmology: theory\n", | |
| "— cosmology: observations\n", | |
| "1\n", | |
| "INTRODUCTION\n", | |
| "A key problem in modern cosmology is the determination\n", | |
| "of the geometry of the universe. Classical methods of de-\n", | |
| "riving the cosmological parameters involve measurements of\n", | |
| "the redshift dependence of apparent luminosities of ‘stan-\n", | |
| "dard candles’, or the angular sizes of ‘standard rulers’, or\n", | |
| "the number densities of non-evolving objects. These meth-\n", | |
| "ods are expected to probe the background geometry inde-\n", | |
| "pendent of any structure formation. Many of these classical\n", | |
| "approaches have been limited by the difficulties in identi-\n", | |
| "fying objects, distributed over cosmic timescales, that are\n", | |
| "untouched by astrophysical evolution; however, it may be\n", | |
| "noted that progress has recently been reported using super-\n", | |
| "novae Type 1a as standard candles (Perlmutter et al. 1998).\n", | |
| "The formation of structure is dependent on the back-\n", | |
| "ground cosmology. Attempts have been made to exploit\n", | |
| "this coupling by examining the parameter space allowed\n", | |
| "for the cosmological constants by favoured models of struc-\n", | |
| "ture formation. Motivated by the discovery of anisotropy in\n", | |
| "the cosmic microwave background (CMB), progress in the\n", | |
| "⋆ E-mail : [email protected]\n", | |
| "†\n", | |
| "E-mail : [email protected]\n", | |
| "understanding of physical mechanisms responsible for the\n", | |
| "anisotropy spectrum and the influence of the background\n", | |
| "cosmology in the generation of these anisotropies have led\n", | |
| "to attempts at deriving constraints on the cosmological pa-\n", | |
| "rameters from the shape of the CMB anisotropy spectrum\n", | |
| "(Bond et al. 1994). Anisotropies in the CMB are usually de-\n", | |
| "scribed by its power spectrum: Cl = ⟨| Clm |2⟩ represents\n", | |
| "the anisotropy power at multipole order l, where Clm are\n", | |
| "the coefficients of the spherical harmonic decomposition of\n", | |
| "the fractional temperature fluctuations. The density fluctu-\n", | |
| "ations that are hypothesized to have been generated in the\n", | |
| "very early universe are believed to grow via gravitational\n", | |
| "instabilities and give rise to the large scale structures we see\n", | |
| "in the present day universe. The CMB temperature fluctu-\n", | |
| "ations seen today on large angular scales exceeding about\n", | |
| "a degree are believed to be a direct consequence of mat-\n", | |
| "ter inhomogeneities on scales exceeding ≈ 100 Mpc at the\n", | |
| "recombination epoch. The gravitational and astrophysical\n", | |
| "evolution, in the post-recombination universe, which led to\n", | |
| "the formation of galaxies and their clustering, may have al-\n", | |
| "tered these primary radiation anisotropies and may have\n", | |
| "given rise to the dominant CMB fluctuations on small an-\n", | |
| "gular scales. Since the generation of the CMB anisotropies\n", | |
| "and their appearance on the sky is intimately linked with\n", | |
| "c⃝ 0000 RAS\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n", | |
| "arXiv:astro-ph/9909301v1 17 Sep 1999\n", | |
| "CMBology\n", | |
| "Charles H. Lineweaver\n", | |
| "School of Physics, University of New South Wales\n", | |
| "Sydney, NSW, 2052 Australia\n", | |
| "Abstract.\n", | |
| "The details of the CMB power spectrum are being revealed\n", | |
| "through the combined efforts of the world’s CMBologists. The current\n", | |
| "data set constrains several cosmological parameters. A combination with\n", | |
| "other (non-lensing) constraints yields estimates of the cosmological con-\n", | |
| "stant: ΩΛ = 0.65 ± 0.13, the mass density: Ωm = 0.23 ± 0.08 and the age\n", | |
| "of the Universe: to = 13.4±1.6. Lensing data is not yet comfortable with\n", | |
| "these values.\n", | |
| "1.\n", | |
| "Seeing Sounds in the CMB (see Figure 1)\n", | |
| "As the Universe cools it goes from being radiation dominated to matter domi-\n", | |
| "nated. The boundary is labelled ‘zeq’ on the right side of Fig. 1. As the Universe\n", | |
| "cools further, electrons and protons combine, thereby decoupling from photons\n", | |
| "during the redshift interval ‘∆zdec’. The opaque universe becomes transparent.\n", | |
| "Present observers, on the left of Fig. 1, look back and see hot and cold spots on\n", | |
| "the surface of last scattering. But where did the hot and cold spots come from?\n", | |
| "At zeq, dark matter over-densities begin to collapse gravitationally.\n", | |
| "The\n", | |
| "photon-baryon fluid (grey) falls (inward pointing arrows) into the dark matter\n", | |
| "potential wells – gets compressed (dark grey) and then rebounds (outward point-\n", | |
| "ing arrows) due to photon pressure support – leaving less dense regions (white)\n", | |
| "at the bottoms of the wells, then recollapses and so on. In the observable inter-\n", | |
| "val ∆zdec, the phases at which we see these oscillations depend on their physical\n", | |
| "size. Four different sizes with four different phases are shown in Fig. 1. From\n", | |
| "top to bottom: maximum Doppler inward velocities, maximum adiabatic com-\n", | |
| "pression, maximum Doppler outward velocities, maximum adiabatic rarefaction.\n", | |
| "The corresponding power spectrum of the CMB, Cℓ, is shown on the left. Notice\n", | |
| "that the peaks in the total power spectrum are due to adiabatic compression and\n", | |
| "rarefaction, while the valleys are filled in by the relatively smaller Doppler peaks.\n", | |
| "Although we have used the example of dark matter over-densities, we are in the\n", | |
| "regime of small amplitude linear fluctuations and so dark matter under-densities\n", | |
| "produce the same power spectrum, i.e., the first and largest peak in the total\n", | |
| "spectrum is produced by equal numbers of hot and cold spots on the surface\n", | |
| "of last scattering. When we see hot and cold spots in the CMB we are seeing\n", | |
| "sound: acoustic adiabatic compressions and rarefactions, visible across 13 billion\n", | |
| "years of vacuum.\n", | |
| "1\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n", | |
| "I. INTRODUCTION\n", | |
| "Some might say that the ultimate inverse problem is to understand the origin of structure\n", | |
| "in the Universe using information that is currently available: it is the central problem in\n", | |
| "early-Universe cosmology [27,18]. One of the available cosmological observables is the cosmic\n", | |
| "microwave background radiation (CMBR). It is known that the temperature of the CMBR is\n", | |
| "remarkably uniform in its spatial variation, with δT/T ≃ 10−5–10−4 [35], [18, §1.5], implying\n", | |
| "that the expansion of the Universe is largely isotropic. Conversely, inhomogeneities in the\n", | |
| "density of the Universe lead to temperature anisotropies, so that these can be used as a\n", | |
| "sensitive test of theories of structure formation [36].\n", | |
| "In these theories, the primary unknown function is P(k), the power spectrum of the\n", | |
| "primordial density fluctuations.\n", | |
| "Under certain assumptions, it can be shown that P(k)\n", | |
| "satisfies the following set of equations:\n", | |
| "Z ∞\n", | |
| "0\n", | |
| "k−2 P(k) j2\n", | |
| "ℓ (ky) dk = Cℓ,\n", | |
| "ℓ = 0, 1, 2, . . . .\n", | |
| "(1)\n", | |
| "Here, y is a given positive constant, jℓ is a spherical Bessel function and the constants Cℓ\n", | |
| "are given.\n", | |
| "Experiments have provided estimates for a finite number of the Cℓ’s. How can these\n", | |
| "be used to recover P(k)? This is a major question, but it is not our main concern here.\n", | |
| "We are interested, first, in an idealised problem: given an exact knowledge of all the Cℓ’s,\n", | |
| "reconstruct P. We show that this inverse problem can be solved exactly. This is a new\n", | |
| "result. Furthermore, we show that the same solution can be recovered if the first few (in\n", | |
| "fact, any finite number) of the Cℓ’s are unknown.\n", | |
| "What use is an exact inversion formula for an idealised problem? First, it reveals the\n", | |
| "ill-posed nature of the problem.1\n", | |
| "Thus, P(k) is found to be given by the Fourier sine\n", | |
| "transform of a certain function g0(λ), which is defined by an infinite series with a finite\n", | |
| "interval of convergence. This means that techniques of analytic continuation are required,\n", | |
| "a process that can be very difficult numerically [12] and one that gives the problem its ill-\n", | |
| "posed character. Second, our inversion formula reveals some of the analytic structure. For\n", | |
| "example, the low-k behaviour of P(k) is intimately related to the asymptotic behaviour of\n", | |
| "g0(λ) as λ → ∞. Third, exact results can be used to test numerical algorithms designed for\n", | |
| "the finite-data problem.\n", | |
| "The plan of the paper is as follows. In the next section, we sketch a derivation of the\n", | |
| "governing equations (1). Careful derivations can be found in the literature; references are\n", | |
| "given. Our aim is to motivate the study of (1) in a way that is accessible to non-specialists.\n", | |
| "Thus, we limit our discussion to perhaps the simplest model of the underlying physics.\n", | |
| "In section III, we formulate two moment-like problems, called the Basic Problem and the\n", | |
| "Reduced-data Problem in which we are given Cℓ for ℓ ≥ 0 and ℓ ≥ ℓ0, respectively, where ℓ0\n", | |
| "is any fixed positive integer. Apart from the system (1), we also consider (in section V) the\n", | |
| "analogous classical moment problem where j2\n", | |
| "ℓ (ky) is replaced by kℓ. For both cases, we apply\n", | |
| "1For the definition of an ill-posed problem, see, for example [8, ch 4].\n", | |
| "2\n", | |
| "\n", | |
| "--------------------------------------------------------------------------------\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "results = tbl.search(model.encode(\"the cosmic microwave background is cold\")).limit(3).to_pandas().text.tolist()\n", | |
| "for result in results:\n", | |
| " print(result)\n", | |
| " print(\"-\" * 80)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "534e5e2e-811e-4bf6-a09c-65c078a83ba2", | |
| "metadata": {}, | |
| "source": [ | |
| "## Wrap everything up into a single function" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "fefc2f88-76c4-4092-a378-183b8df60869", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def process(directory, lock=None):\n", | |
| " import os\n", | |
| " os.environ[\"AWS_DEFAULT_REGION\"] = \"us-west-2\"\n", | |
| "\n", | |
| " out = extract(directory)\n", | |
| " out = list(out)\n", | |
| " df = pd.DataFrame(out, dtype=\"string[pyarrow]\")\n", | |
| " \n", | |
| " model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", | |
| " embeddings = model.encode(df.text)\n", | |
| " df[\"vector\"] = list(embeddings)\n", | |
| " \n", | |
| " import lancedb\n", | |
| " db = lancedb.connect(\n", | |
| " \"s3://openscapes-scratch/mrocklin/lance\", \n", | |
| " region=\"us-west-2\",\n", | |
| " )\n", | |
| " tbl = db.open_table(\"arxiv\")\n", | |
| " with lock:\n", | |
| " tbl.add(df)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7d2409a9-62f8-4a81-928e-8724821d02a4", | |
| "metadata": {}, | |
| "source": [ | |
| "## Run on cloud machines" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "631f523c-9e0d-4f8b-b62b-0f16f8826c58", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import coiled\n", | |
| "# Ten workers on m7i.xlarge instances in us-east-1.\n", | |
| "# NOTE(review): `process` connects to a Lance dataset with region=\"us-west-2\";\n", | |
| "# confirm that writing across regions from these workers is intended.\n", | |
| "cluster = coiled.Cluster(\n", | |
| " n_workers=10,\n", | |
| " worker_vm_types=[\"m7i.xlarge\"],\n", | |
| " spot_policy=\"spot_with_fallback\",\n", | |
| " region=\"us-east-1\",\n", | |
| ")\n", | |
| "# Client used by the following cells to submit work to this cluster.\n", | |
| "client = cluster.get_client()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "96010081-ae64-431b-a0a8-5cf34336cef4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from dask.distributed import Lock\n", | |
| "\n", | |
| "# Named lock passed to every `process` task (as `lock=`), which holds it\n", | |
| "# around the `tbl.add` call.\n", | |
| "lock = Lock(\"lance-arxiv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "bd5ecc6e-8a6d-4a75-8c9a-5f3d633d3494", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Submit one `process` task per entry of directories[-100:-5] (from the\n", | |
| "# 100th-from-last entry up to, but not including, the 5th-from-last);\n", | |
| "# every task receives the same lock.\n", | |
| "futures = client.map(process, directories[-100:-5], lock=lock)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "8a956ef7-c395-42b9-b03e-783e31c4e019", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%time\n", | |
| "from dask.distributed import wait\n", | |
| "# Block on the submitted futures; the %%time magic above reports how long the cell took.\n", | |
| "wait(futures)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "263eae05-e69e-4321-874d-2791ae61943b", | |
| "metadata": {}, | |
| "source": [ | |
| "## Kinda slow, try with GPUs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "90bec3b8-29af-4f97-87aa-2150e0dab958", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import sys\n", | |
| "import coiled\n", | |
| "coiled.create_software_environment(\n", | |
| " name=\"arxiv-rag\",\n", | |
| " gpu_enabled=True,\n", | |
| " conda={\n", | |
| " \"channels\": [\"huggingface\", \"pytorch\", \"nvidia\", \"conda-forge\", \"defaults\",],\n", | |
| " \"dependencies\": [\"sentence-transformers\", \"pytorch-cuda\", \"dask\", \"pyarrow\", \"s3fs\", f\"python={sys.version.split()[0]}\"],\n", | |
| " },\n", | |
| " pip=[\"lancedb\", \"pymupdf\", \"pynvml\"],\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "320b43f9-5f0f-419a-8d33-ab0af53df0c3", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import coiled\n", | |
| "# Ten workers in us-east-1 with worker_gpu=True, running the \"arxiv-rag\"\n", | |
| "# software environment (the name passed to create_software_environment).\n", | |
| "gpu_cluster = coiled.Cluster(\n", | |
| " n_workers=10,\n", | |
| " worker_gpu=True,\n", | |
| " spot_policy=\"spot_with_fallback\",\n", | |
| " region=\"us-east-1\",\n", | |
| " software=\"arxiv-rag\",\n", | |
| ")\n", | |
| "# Separate client for the GPU cluster; the CPU `client` is left untouched.\n", | |
| "gpu_client = gpu_cluster.get_client()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "99539528-745b-46fe-94e0-765c48c8a5be", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from dask.distributed import Lock\n", | |
| "\n", | |
| "# Lock for the GPU run; it reuses the name \"lance-arxiv\" from the CPU run.\n", | |
| "gpu_lock = Lock(\"lance-arxiv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "092fdd58-a69f-4d92-8842-914c032957bb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Same slice of `directories` as the CPU run, submitted through the GPU\n", | |
| "# client with the GPU lock.\n", | |
| "gpu_futures = gpu_client.map(process, directories[-100:-5], lock=gpu_lock)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b5c26455-016f-4ec3-b652-868c143f3875", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%time\n", | |
| "wait(gpu_futures)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python [conda env:rag]", | |
| "language": "python", | |
| "name": "conda-env-rag-py" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.8" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment