Skip to content

Instantly share code, notes, and snippets.

@ShrineFox
Last active September 27, 2023 17:40
Show Gist options
  • Select an option

  • Save ShrineFox/c74ff32481ce6c791d2c6d1e8c0efc01 to your computer and use it in GitHub Desktop.

Select an option

Save ShrineFox/c74ff32481ce6c791d2c6d1e8c0efc01 to your computer and use it in GitHub Desktop.
Scrape website content within a certain div as plain text, ignoring unwanted divs
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Security.Policy;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
namespace EvflDl
{
/// <summary>
/// Console tool that renders pages in Chrome via Selenium and saves the text
/// of the code listing found on each page to a local file.
/// </summary>
internal class Program
{
    // Address that every entry of the input list is appended to.
    private const string BaseUrl = @"https://acnh.isomorphicbox.com/evfl/2.0.4/";

    // Absolute XPath of the <li> elements that make up the listing.
    private const string ListItemXPath =
        "/html/body/div[@id='content']/div[@id='viewer']/div[@id='code-wrapper']" +
        "/pre[@id='code']/ol[@class='linenums']/li";

    // Descendants of an <li> that have no ancestor whose class contains "tooltip".
    // Doing the filtering in XPath avoids one WebDriver round-trip per descendant.
    // NOTE(review): an element that itself carries the tooltip class is still
    // selected (only its descendants are excluded) - confirm that is intended.
    private const string NonTooltipDescendantsXPath =
        ".//*[not(ancestor::*[contains(@class, 'tooltip')])]";

    // True once the document is loaded and, if the page uses jQuery, no AJAX
    // request is in flight. The typeof guard keeps the script from throwing on
    // pages that do not define jQuery.
    private const string PageIdleScript =
        "return (document.readyState === 'complete' && " +
        "(typeof jQuery === 'undefined' || jQuery.active === 0));";

    // Upper bound on how long to wait for a page to become idle.
    private static readonly TimeSpan PageIdleTimeout = TimeSpan.FromSeconds(20);

    /// <summary>
    /// Reads file names from the text file given as the first argument (one per
    /// line) and, for every name that does not already exist on disk, downloads
    /// <c>BaseUrl + name</c> and writes the extracted listing text to that name.
    /// </summary>
    /// <param name="args">
    /// Command-line arguments; <c>args[0]</c> is the path of the list file.
    /// </param>
    static void Main(string[] args)
    {
        if (args == null || args.Length == 0)
        {
            Console.Error.WriteLine("Usage: EvflDl <file-with-one-name-per-line>");
            Environment.ExitCode = 1;
            return;
        }

        if (!File.Exists(args[0]))
        {
            Console.Error.WriteLine("List file not found: " + args[0]);
            Environment.ExitCode = 1;
            return;
        }

        // Blank lines would produce a bogus request and an invalid output path.
        // Duplicates are dropped so a name is never downloaded twice in one run.
        List<string> pending = File.ReadAllLines(args[0])
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Distinct()
            .Where(name => !File.Exists(name))
            .ToList();

        if (pending.Count == 0)
        {
            return;
        }

        try
        {
            // One browser instance is shared by all downloads; starting Chrome
            // once per file is slow and gains nothing.
            using (IWebDriver driver = new ChromeDriver())
            {
                foreach (string line in pending)
                {
                    DownloadOne(driver, line);
                }
            }
        }
        catch (Exception ex)
        {
            // Reached only when the browser itself cannot be started or shut down.
            Console.Error.WriteLine("Browser session failed: " + ex.Message);
            Environment.ExitCode = 1;
        }
    }

    // Downloads a single entry and writes it to disk. Failures are reported on
    // stderr and do not stop the remaining downloads.
    private static void DownloadOne(IWebDriver driver, string line)
    {
        string url = BaseUrl + line;
        Console.WriteLine(url);

        try
        {
            string content = ScrapeListing(driver, url);
            if (content == null)
            {
                // Writing an empty file here would make the "already exists"
                // check skip this entry on every later run.
                Console.Error.WriteLine("No listing lines found at " + url + "; nothing written.");
                return;
            }

            Console.WriteLine(content);
            File.WriteAllText(line, content);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine("Failed to download " + url + ": " + ex.Message);
        }
    }

    // Loads the page, waits until it is idle and returns the listing text with
    // one "\r\n"-terminated line per <li>, or null when the page has no <li>.
    private static string ScrapeListing(IWebDriver driver, string url)
    {
        driver.Navigate().GoToUrl(url);

        WebDriverWait wait = new WebDriverWait(driver, PageIdleTimeout);
        wait.Until(d =>
        {
            object idle = ((IJavaScriptExecutor)d).ExecuteScript(PageIdleScript);
            return idle is bool && (bool)idle;
        });

        IReadOnlyCollection<IWebElement> items = driver.FindElements(By.XPath(ListItemXPath));
        if (items.Count == 0)
        {
            return null;
        }

        StringBuilder content = new StringBuilder();
        foreach (IWebElement item in items)
        {
            foreach (IWebElement node in item.FindElements(By.XPath(NonTooltipDescendantsXPath)))
            {
                content.Append(node.Text);
            }

            content.Append("\r\n");
        }

        return content.ToString();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment