Last active
September 27, 2023 17:40
-
-
Save ShrineFox/c74ff32481ce6c791d2c6d1e8c0efc01 to your computer and use it in GitHub Desktop.
Scrape website content within a certain div as plain text, ignoring unwanted divs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using HtmlAgilityPack; | |
| using System; | |
| using System.Collections.Generic; | |
| using System.IO; | |
| using System.Linq; | |
| using System.Net; | |
| using System.Net.Http; | |
| using System.Security.Policy; | |
| using System.Text; | |
| using System.Threading.Tasks; | |
| using System.Xml; | |
| using OpenQA.Selenium; | |
| using OpenQA.Selenium.Chrome; | |
| using OpenQA.Selenium.Support.UI; | |
namespace EvflDl
{
    /// <summary>
    /// Console tool that downloads the rendered code listing of each page named in a
    /// list file and saves it as plain text, one output file per list entry.
    /// </summary>
    /// <remarks>
    /// Usage: <c>EvflDl &lt;list-file&gt;</c>. Every non-blank line of the list file is
    /// appended to <see cref="BaseUrl"/> to form the page URL, and is also used as the
    /// output file path. Entries whose output file already exists are skipped, so an
    /// interrupted run can simply be restarted.
    /// </remarks>
    internal class Program
    {
        /// <summary>Prefix that each list entry is appended to in order to build the page URL.</summary>
        private const string BaseUrl = @"https://acnh.isomorphicbox.com/evfl/2.0.4/";

        /// <summary>Absolute XPath of the <c>li</c> elements that hold one line of the listing each.</summary>
        private const string CodeLinesXPath =
            "/html/body/div[@id='content']/div[@id='viewer']/div[@id='code-wrapper']" +
            "/pre[@id='code']/ol[@class='linenums']/li";

        /// <summary>
        /// Relative XPath selecting every descendant element that has no ancestor whose
        /// class attribute contains "tooltip". Evaluated by the browser in a single call
        /// instead of issuing one ancestor query per descendant.
        /// </summary>
        private const string NonTooltipDescendantsXPath =
            ".//*[not(ancestor::*[contains(@class, 'tooltip')])]";

        /// <summary>Maximum time to wait for a page to finish loading and scripting.</summary>
        private static readonly TimeSpan PageReadyTimeout = TimeSpan.FromSeconds(20);

        /// <summary>
        /// Entry point. Reads the list file given as the first argument and scrapes every
        /// entry that has not been saved yet.
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the list file.</param>
        static void Main(string[] args)
        {
            if (args == null || args.Length == 0 || !File.Exists(args[0]))
            {
                Console.Error.WriteLine("Usage: EvflDl <list-file>");
                Console.Error.WriteLine("The list file must exist and contain one page name per line.");
                Environment.ExitCode = 1;
                return;
            }

            // Blank lines would resolve to the bare base URL and an empty output path, so drop them.
            List<string> pending = File.ReadAllLines(args[0])
                .Where(entry => !string.IsNullOrWhiteSpace(entry))
                .Distinct()
                .Where(entry => !File.Exists(entry))
                .ToList();

            if (pending.Count == 0)
            {
                // Nothing to do; avoid launching a browser at all.
                return;
            }

            // One browser session is reused for all pages; starting Chrome per URL is slow.
            using (IWebDriver driver = new ChromeDriver())
            {
                foreach (string line in pending)
                {
                    string url = BaseUrl + line;
                    Console.WriteLine(url);

                    try
                    {
                        string content = ScrapeListing(driver, url);
                        Console.WriteLine(content);
                        SaveListing(line, content);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a failing page must not stop the remaining downloads,
                        // but the failure is reported instead of being silently discarded.
                        Console.Error.WriteLine("Failed to scrape " + url + ": " + ex.Message);
                        Environment.ExitCode = 1;
                    }
                }
            }
        }

        /// <summary>
        /// Loads <paramref name="url"/> and returns the text of its code listing, one
        /// listing line per output line (terminated with CRLF).
        /// </summary>
        /// <param name="driver">Browser session used to load and query the page.</param>
        /// <param name="url">Address of the page to scrape.</param>
        /// <returns>The concatenated listing text; empty when the page has no listing lines.</returns>
        private static string ScrapeListing(IWebDriver driver, string url)
        {
            driver.Navigate().GoToUrl(url);
            WaitUntilPageReady(driver);

            StringBuilder content = new StringBuilder();
            foreach (IWebElement lineElement in driver.FindElements(By.XPath(CodeLinesXPath)))
            {
                // Same filter as a per-element ancestor check, done in one round trip.
                IReadOnlyCollection<IWebElement> visibleParts =
                    lineElement.FindElements(By.XPath(NonTooltipDescendantsXPath));

                foreach (IWebElement part in visibleParts)
                {
                    content.Append(part.Text);
                }

                content.Append("\r\n");
            }

            return content.ToString();
        }

        /// <summary>
        /// Blocks until the document has finished loading and, when the page uses jQuery,
        /// until no jQuery AJAX requests are in flight.
        /// </summary>
        /// <param name="driver">Browser session showing the page to wait for.</param>
        private static void WaitUntilPageReady(IWebDriver driver)
        {
            WebDriverWait wait = new WebDriverWait(driver, PageReadyTimeout);

            // The typeof guard keeps the script from raising a ReferenceError on pages
            // that do not define jQuery; such pages only need readyState to be complete.
            wait.Until(d => true.Equals(((IJavaScriptExecutor)d).ExecuteScript(
                "return (document.readyState === 'complete' && " +
                "(typeof jQuery === 'undefined' || jQuery.active === 0));")));
        }

        /// <summary>
        /// Writes <paramref name="content"/> to <paramref name="path"/>, creating the
        /// target directory first when the path contains one.
        /// </summary>
        /// <param name="path">Output file path, taken verbatim from the list file.</param>
        /// <param name="content">Text to store.</param>
        private static void SaveListing(string path, string content)
        {
            string directory = Path.GetDirectoryName(path);
            if (!string.IsNullOrEmpty(directory))
            {
                Directory.CreateDirectory(directory);
            }

            File.WriteAllText(path, content);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment