Greasy Fork is available in English.

MagicScraper

Scrapes and displays data from the web page based on rules.

此脚本不应直接安装,它是一个供其他脚本使用的外部库。如果您需要使用该库,请在脚本元属性加入:// @require https://update.greasyfork.org/scripts/471264/1222840/MagicScraper.js

// ==UserScript==
// @name         MagicScraper
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Scrapes and displays data from the web page based on rules.
// @author       aolko
// @match        *://*/*
// @grant        GM_addStyle
// ==/UserScript==
/**
 * Scrapes the current page according to per-domain (and optionally per-page)
 * selector rules and returns the collected data.
 *
 * Rule format:
 *   - Top level: hostname pattern -> rule set. `*` in a pattern is a wildcard,
 *     matching is case-insensitive, and the first matching pattern wins.
 *   - Rule set: result key -> CSS selector string. Each match is collected as
 *     `{ text, html }` (the element's textContent and innerHTML).
 *   - A rule whose value is an object is a nested rule: its KEY is the
 *     selector of the container elements and its value is the rule set that
 *     is evaluated inside each container. The result is one object per
 *     container.
 *   - The optional `pages` entry of a domain maps pathnames (exact, or `*`
 *     patterns) to rule sets that are merged over the domain-level rules.
 *
 * @param {Object|string} rules - Rules object, or the URL of a JSON document
 *   shaped like `{ "rules": { ... } }` that is fetched and then applied.
 * @param {Object} [options]
 * @param {boolean} [options.replaceBody] - Replace the content of
 *   `document.body` with elements built from the scraped data.
 * @param {boolean} [options.keepChildren] - Forwarded to the DOM builder; see
 *   the note on `appendScrapedNode`.
 * @returns {Object} The scraped data. With a rules object it is populated on
 *   return. With a rules URL the same object is returned empty and filled in
 *   place once the rules have been fetched and applied.
 */
function magicScraper(rules, options = {}) {
  // Tolerate an explicit `null` so the option lookups below cannot throw.
  const settings = options || {};
  // This exact object is handed back to the caller; it is only ever mutated,
  // never replaced, so results from the asynchronous path still reach it.
  const scrapedData = {};

  /**
   * Creates one element holding `html` and appends it to `parent`.
   * Falls back to a <div> when `parent` has no tagName (a DocumentFragment).
   *
   * NOTE(review): with `keepChildren` set, every node currently under
   * `parent` (including scraped nodes appended earlier) is moved into the
   * new node before it is appended. This mirrors the original behaviour;
   * confirm it is what `keepChildren` is meant to do.
   */
  function appendScrapedNode(parent, html, keepChildren) {
    const node = document.createElement(parent.tagName || 'div');
    node.innerHTML = html;
    if (keepChildren && parent.children.length > 0) {
      Array.from(parent.children).forEach(child => {
        node.appendChild(child);
      });
    }
    parent.appendChild(node);
    return node;
  }

  /**
   * Builds DOM nodes under `element` for every value in `data`.
   * Strings and `{ text, html }` records become one element each; any other
   * object (a nested result) becomes a wrapper element that is filled
   * recursively.
   */
  function createDOMFromScrapedData(data, element, keepChildren) {
    Object.keys(data || {}).forEach(key => {
      const value = data[key];
      const items = Array.isArray(value) ? value : [value];
      items.forEach(item => {
        if (typeof item === 'string') {
          appendScrapedNode(element, item, keepChildren);
        } else if (item !== null && typeof item === 'object') {
          if (typeof item.html === 'string') {
            // Scraped record: render its markup, not "[object Object]".
            appendScrapedNode(element, item.html, keepChildren);
          } else {
            const wrapper = document.createElement(element.tagName || 'div');
            element.appendChild(wrapper);
            createDOMFromScrapedData(item, wrapper, keepChildren);
          }
        }
      });
    });
  }

  /**
   * Tests `current` against a glob-style `pattern`, case-insensitively.
   * Every `*` is a wildcard; all other characters are matched literally.
   */
  function matchPageOrDomain(pattern, current) {
    const source = pattern
      // Escape every regex metacharacter except `*` ...
      .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
      // ... then expand ALL wildcards, not just the first one.
      .replace(/\*/g, '.*');
    return new RegExp(`^${source}$`, 'i').test(current);
  }

  /**
   * Resolves the rule set for the given hostname and pathname: the rules of
   * the first matching domain pattern, overlaid with the matching page rules.
   * Returns an empty object when no domain pattern matches.
   */
  function scrapeDataByRules(rulesObj, currentDomain, currentPage) {
    const allRules = rulesObj || {};
    const domainKey = Object.keys(allRules).find(pattern =>
      matchPageOrDomain(pattern, currentDomain)
    );
    if (domainKey === undefined) {
      return {};
    }
    const domainData = allRules[domainKey] || {};

    // `pages` is bookkeeping, not a selector rule; keep it out of the result.
    const merged = Object.assign({}, domainData);
    delete merged.pages;

    const pages = domainData.pages;
    if (currentPage && pages && typeof pages === 'object') {
      // An exact pathname wins; otherwise fall back to the first `*` pattern.
      const pageKey = Object.prototype.hasOwnProperty.call(pages, currentPage)
        ? currentPage
        : Object.keys(pages).find(pattern => matchPageOrDomain(pattern, currentPage));
      if (pageKey !== undefined) {
        Object.assign(merged, pages[pageKey] || {});
      }
    }
    return merged;
  }

  /**
   * Fetches a JSON rules document and passes the rule set resolved for the
   * current location to `callback`. On any loading failure the error is
   * logged and `callback` receives `{}`. The callback runs exactly once; an
   * error it throws is logged separately instead of being reported as a
   * loading error.
   *
   * @returns {Promise<void>} Settles after the callback has run.
   */
  function loadExternalRules(externalRulesURL, currentDomain, currentPage, callback) {
    return fetch(externalRulesURL)
      .then(response => {
        // fetch() only rejects on network errors; HTTP errors must be checked.
        if (!response.ok) {
          throw new Error(`Unexpected HTTP status ${response.status} for ${externalRulesURL}`);
        }
        return response.json();
      })
      .then(data => scrapeDataByRules((data && data.rules) || {}, currentDomain, currentPage))
      .catch(err => {
        console.error('Error loading external rules:', err);
        return {};
      })
      .then(callback)
      .catch(err => {
        console.error('Error applying scraped rules:', err);
      });
  }

  /**
   * Evaluates `ruleSet` against `root` and returns the collected data.
   * Each call returns its own object, so nested containers cannot overwrite
   * one another.
   */
  function processNestedRules(ruleSet, root) {
    const result = {};
    Object.keys(ruleSet || {}).forEach(key => {
      const selector = ruleSet[key];
      if (typeof selector === 'string') {
        result[key] = Array.from(root.querySelectorAll(selector), element => ({
          text: element.textContent,
          html: element.innerHTML,
        }));
      } else if (selector !== null && typeof selector === 'object') {
        // Nested rule: the key selects the containers, the value is the rule
        // set evaluated inside each of them.
        result[key] = Array.from(root.querySelectorAll(key), container =>
          processNestedRules(selector, container)
        );
      }
    });
    return result;
  }

  /**
   * Applies a resolved rule set to the document, optionally replaces the
   * body content, and publishes the data on the returned object.
   */
  function handleRules(pageRules) {
    if (Object.keys(pageRules).length === 0) {
      console.warn('No rules found for the current domain and page.');
      return;
    }
    // Scrape first: replacing the body below would destroy the source nodes.
    const result = processNestedRules(pageRules, document);
    if (settings.replaceBody) {
      const fragment = document.createDocumentFragment();
      createDOMFromScrapedData(result, fragment, settings.keepChildren);
      document.body.innerHTML = '';
      document.body.appendChild(fragment);
    }
    Object.assign(scrapedData, result);
  }

  /** Resolves the rules (inline object or remote URL) and applies them. */
  function runScraping() {
    const currentDomain = window.location.hostname;
    const currentPage = window.location.pathname;
    if (typeof rules === 'string') {
      // A string is treated as the URL of an external rules document.
      loadExternalRules(rules, currentDomain, currentPage, handleRules);
    } else {
      handleRules(scrapeDataByRules(rules, currentDomain, currentPage));
    }
  }

  runScraping();
  // Expose the scraped data object for further use.
  return scrapedData;
}