# Парсер новостей

* Убирает рекламу
* Выделяет заголовок, картинку, текст статьи
* Выставляет картинку размером 600 пикселей
* Текст выравнивает по ширине
* Указывает источник
* Для ЗВ готовит новость для КроссПостинга
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

478 lines
18 KiB

// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.3.1
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://rg.ru/*
// @match https://www.cnews.ru/*
// @match https://mixednews.ru/*
// @match https://russian.rt.com/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
(function() {
'use strict';
// Поиск элементов по регулярному выражению
// Returns the class list of an element as a plain string.
// HTML elements expose `className` as a string, while SVG elements expose an
// SVGAnimatedString object whose `baseVal` holds the class text. Returning the
// object itself would make later regular-expression checks run against
// "[object SVGAnimatedString]" instead of the real class names.
//   a_Element - DOM element (or any object carrying a `className` property).
// Returns the class string, or '' when no usable class value is present.
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
// Returns the `nodeName` property of the given element.
// Passed to CheckRegExp as the name getter when elements are matched by tag
// name rather than by class.
//   a_Element - DOM element (or any object carrying a `nodeName` property).
function GetNodeName(a_Element) {
    const { nodeName } = a_Element;
    return nodeName;
}
// Tests whether the name of an element contains a whitespace-delimited token
// that matches the given pattern.
//   a_GetElementNameFunc - function(element) returning the string to test
//                          (for example the class list or the node name).
//   a_RegExpPattern      - regular-expression source for one token.
//   a_Element            - element handed to a_GetElementNameFunc.
// Returns true when a matching token is found, false otherwise.
function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
    // The pattern is wrapped in its own group: without it a top-level "|" in the
    // pattern (e.g. 'article-intro|article-body') would split the whole expression
    // and the token boundaries would apply to only one alternative each.
    const re = new RegExp('(?:^|\\s)(?:' + a_RegExpPattern + ')(?!\\S)');
    return re.test(a_GetElementNameFunc(a_Element));
}
// Collects every descendant of a_ElementParent whose name is accepted by CheckRegExp.
//   a_GetElementNameFunc - function(element) returning the string to test.
//   a_RegExpPattern      - pattern forwarded to CheckRegExp.
//   a_ElementParent      - search root; the whole document is used when it is falsy.
// Returns an array with the matching elements in document order.
function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const search_root = a_ElementParent || document;
    const all_descendants = Array.from(search_root.getElementsByTagName('*'));
    return all_descendants.filter(
        (candidate) => CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, candidate)
    );
}
// Работа со строками
// Strips leading and trailing whitespace from a string.
//   str - text to trim; any falsy value (null, undefined, '') is treated as ''.
// Returns the trimmed text.
function TrimString(str) {
    const text = str || '';
    return text.trim();
}
// Drops everything up to and including the first occurrence of a_Splitter.
//   a_String   - text to cut.
//   a_Splitter - marker to search for.
// Returns the text after the marker, or a_String unchanged when the marker is absent.
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const marker_at = a_String.indexOf(a_Splitter);
    return marker_at === -1
        ? a_String
        : a_String.slice(marker_at + a_Splitter.length);
}
// Cuts the text at the first occurrence of a_Splitter.
//   a_String       - text to cut.
//   a_Splitter     - marker to search for.
//   a_SaveSplitter - when truthy the marker itself is kept at the end of the result.
// Returns the text before the marker (optionally with the marker), or a_String
// unchanged when the marker is absent.
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const marker_at = a_String.indexOf(a_Splitter);
    if (marker_at === -1) {
        return a_String;
    }
    const cut_at = a_SaveSplitter ? marker_at + a_Splitter.length : marker_at;
    return a_String.slice(0, cut_at);
}
// Removes the query part of a URL: everything from the first '?' onwards.
//   a_Url - URL text.
// Returns the URL without the '?' and whatever follows it.
function ClearUrl(a_Url) {
    return RemoveAfterSplitter(a_Url, '?', false);
}
// Builds a text cleaner that cuts leading boilerplate from article markup.
//   a_RemoveBeforeList - markers; for each one, in order, everything up to and
//                        including its first occurrence is removed.
//   a_OnlyFirstIndex   - when truthy only the element with index 1 is cleaned
//                        (a missing or zero index is cleaned as well).
// Returns function(a_Content, a_ElementIndex) -> cleaned content.
function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
    return function ClearTextFunc(a_Content, a_ElementIndex) {
        const is_later_element = a_ElementIndex && a_ElementIndex != 1;
        if (a_OnlyFirstIndex && is_later_element) {
            return a_Content;
        }
        return a_RemoveBeforeList.reduce(
            (text, marker) => RemoveBeforeSplitter(text, marker),
            a_Content
        );
    };
}
// Работа с контейнерами
// Looks for an image inside the given containers and wraps it in a paragraph.
// Each container itself and all of its descendants are examined; an <img> is
// accepted when its src is an http(s) URL containing a .png/.jpg/.jpeg extension
// (any letter case). When several images qualify the last one found is used.
//   a_Elements  - array of container elements (a falsy value yields '').
//   a_TextAlign - value written into the paragraph's text-align style.
// Returns the HTML for a 600px-wide image paragraph, or '' when no image qualifies.
function GetImageInContainers(a_Elements, a_TextAlign) {
    // A regex literal keeps the dot escaped; built from a string the "\." lost its
    // backslash and matched any character.
    const image_url_re = /https?:\/\/.*\.(?:png|jpe?g)/i;
    const IsUsableImage = (a_Node) => a_Node.nodeName == 'IMG' && image_url_re.test(a_Node.src);

    let img_src = '';
    // Iterate by value: for...in would also visit enumerable properties that the
    // host page may have added to Array.prototype.
    const containers = a_Elements ? Array.from(a_Elements) : [];
    for (const container of containers) {
        if (IsUsableImage(container)) {
            img_src = container.src;
        }
        for (const child of Array.from(container.querySelectorAll('*'))) {
            if (IsUsableImage(child)) {
                img_src = child.src;
            }
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + img_src + '" width = "600px"/></p>';
    }
    return '';
}
// Concatenates the output of a_FinishWorkFunc for every element of a_Elements.
//   a_Elements       - array of elements (a falsy value yields '').
//   a_GrubTextFunc   - accepted for interface compatibility; not used here.
//   a_FinishWorkFunc - function(element, index) returning text; index is 1-based.
// Returns the concatenated text.
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    let result = '';
    const count = a_Elements ? a_Elements.length : 0;
    // Indexed loop instead of for...in, which would also visit enumerable
    // properties that the host page may have added to Array.prototype.
    for (let i = 0; i < count; i++) {
        result += a_FinishWorkFunc(a_Elements[i], i + 1);
    }
    return result;
}
// Фильтрация элементов
// Keeps only the elements accepted by the checker.
//   a_Elements       - array (or array-like) of elements.
//   a_ElementChecker - function(element) returning a truthy value to keep the element.
// Returns a new array with the accepted elements in their original order.
function FIlterElements(a_Elements, a_ElementChecker) {
    return Array.from(a_Elements).filter((candidate) => a_ElementChecker(candidate));
}
// Element checker that accepts every element.
//   a_Element - ignored.
// Always returns true.
function ElementCheckerTrue(a_Element) {
    return true;
}
// Element checker that rejects every element.
// MakeContent passes it as the "sub-element to remove" checker for sites where
// nothing has to be removed.
//   a_Element - ignored.
// Always returns false.
function ElementCheckerFalse(a_Element) {
    return false;
}
// Element checker for ria.ru article blocks.
// Rejects blocks whose data-type is 'article', 'banner', 'media' or 'video';
// every other block (including one without a data-type) is accepted.
//   a_Element - element with a `dataset`.
// Returns true when the block should be kept.
function ElementCheckerRia(a_Element) {
    const skipped_types = ['article', 'banner', 'media', 'video'];
    return !skipped_types.includes(a_Element.dataset.type);
}
// Element checker for zakonvremeni.ru: accepts the element that holds the
// article body, i.e. whose `itemprop` attribute contains the token 'articleBody'.
// The value is read from the attribute because `itemprop` is not exposed as a
// property on DOM elements; a plain `itemprop` property is still honoured for
// objects that have no getAttribute().
//   a_Element - DOM element (or object with an `itemprop` property).
// Returns true for the article-body element, false otherwise.
function ElementCheckerZV(a_Element) {
    const raw_value = typeof a_Element.getAttribute === 'function'
        ? a_Element.getAttribute('itemprop')
        : a_Element.itemprop;
    // The attribute may list several space-separated property names.
    const names = String(raw_value || '').split(/\s+/);
    return names.includes('articleBody');
}
// Builds a checker that tells whether a sub-element should be removed from an article.
//   a_Classes   - "|"-separated class-name fragments; an element is removed when
//                 one of its class tokens contains any of them.
//   a_NodeNames - optional "|"-separated node names; an element whose node name
//                 equals one of them is removed as well.
// Returns function(a_Element) -> true to remove, false to keep. Elements that are
// missing or have no parent node are never reported for removal.
function SubElementCheckerToRemoveTemplate(a_Classes, a_NodeNames) {
    const class_pattern = '.*(' + a_Classes + ').*';
    return function SubElementCheckerToRemove(a_Element) {
        const is_attached = Boolean(a_Element && a_Element.parentNode);
        if (!is_attached) {
            return false;
        }
        if (CheckRegExp(GetElementClassName, class_pattern, a_Element)) {
            return true;
        }
        return Boolean(a_NodeNames && CheckRegExp(GetNodeName, '(' + a_NodeNames + ')', a_Element));
    };
}
// Обработка элементов
// Builds a function that extracts the plain text of an element.
// The returned function yields the trimmed `textContent` of the element, but
// only when the element's `innerText` is non-empty; otherwise it yields ''.
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        return a_Element.innerText ? TrimString(a_Element.textContent) : '';
    };
}
// Judging by its name, meant to strip every attribute from a_Element: it builds a
// bare element with the same node name and the same inner HTML and then tries to
// write that element's markup back over a_Element.
// NOTE(review): the write-back uses `outherHTML`, which looks like a misspelling of
// `outerHTML`. As written, the last line only stores the (presumably undefined)
// value of new_el.outherHTML in an ad-hoc property of a_Element, so the element and
// its attributes appear to stay untouched. Confirm before correcting the spelling:
// GetClearHtml later tests `host` on <a> elements, so its output seems to rely on
// attributes such as href surviving this call.
//   a_Element - DOM element to process.
function RemoveAllAttributes(a_Element) {
// Bare element with the same node name as the original.
let new_el = document.createElement(a_Element.nodeName);
// Copy the children markup into the bare element.
new_el.innerHTML = a_Element.innerHTML;
// NOTE(review): `outherHTML` on both sides - see the note above the function.
a_Element.outherHTML = new_el.outherHTML;
}
// Unwraps an element: moves all of its children in front of it (keeping their
// order) and then removes the element itself.
//   a_Element - element to unwrap.
//   a_Parent  - node used as the parent when a_Element has no parentNode.
function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
    const host = a_Element.parentNode || a_Parent;
    for (let child = a_Element.firstChild; child; child = a_Element.firstChild) {
        host.insertBefore(child, a_Element);
    }
    host.removeChild(a_Element);
}
// Produces cleaned-up HTML for one article element.
// All work happens on a deep clone, so the page itself is not modified. Steps:
//   1. remove descendants reported by a_SubElementCheckerToRemove; for the rest
//      call RemoveAllAttributes and drop the class and id attributes;
//   2. unwrap div/span/em/svg/path elements, keeping their children;
//   3. unwrap links whose host equals the current page's hostname;
//   4. set text-align on p/h2 elements and delete the blank ones;
//   5. run a_ClearTextFunc over the resulting markup and wrap it in a_OutTag.
// Parameters:
//   a_Element                   - source element.
//   a_OutTag                    - tag name for the wrapper (no wrapper when falsy).
//   a_TextAlign                 - text-align value (no wrapper when falsy).
//   a_SubElementCheckerToRemove - optional function(element) -> true to remove it.
//   a_ClearTextFunc             - optional function(html, index) -> html.
//   a_ElementIndex              - index forwarded to a_ClearTextFunc.
// Returns the cleaned HTML string; blank markup is returned without a wrapper.
function GetClearHtml(a_Element, a_OutTag, a_TextAlign, a_SubElementCheckerToRemove, a_ClearTextFunc, a_ElementIndex) {
    // Markup is blank when nothing but whitespace and &nbsp; entities is left.
    // Every entity is removed: a plain string pattern in replace() would drop only
    // the first one and leave paragraphs such as "&nbsp;&nbsp;" in the output.
    const IsBlankHtml = (a_Html) => TrimString(a_Html).replace(/&nbsp;|\s/g, '') === '';

    const clear_element = a_Element.cloneNode(true);

    clear_element.querySelectorAll('*').forEach(function (element) {
        if (a_SubElementCheckerToRemove && a_SubElementCheckerToRemove(element)) {
            element.parentNode.removeChild(element);
        }
        else {
            RemoveAllAttributes(element);
            element.removeAttribute('class');
            element.removeAttribute('id');
        }
    });

    // Purely structural or decorative wrappers: keep their content, drop the tag.
    const tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
    tags_to_delete.forEach(function (del_tag_name) {
        clear_element.querySelectorAll(del_tag_name).forEach(function (element) {
            RemoveCurrentElementSaveChild(element, clear_element);
        });
    });

    // Links back to the site being parsed are replaced by their content.
    clear_element.querySelectorAll('a').forEach(function (element) {
        if (element.host == location.hostname) {
            RemoveCurrentElementSaveChild(element, clear_element);
        }
    });

    const tags_to_align = ['p', 'h2'];
    tags_to_align.forEach(function (align_tag_name) {
        clear_element.querySelectorAll(align_tag_name).forEach(function (element) {
            element.style.textAlign = a_TextAlign;
            if (IsBlankHtml(element.innerHTML)) {
                element.parentNode.removeChild(element);
            }
        });
    });

    let result = TrimString(clear_element.innerHTML);
    if (a_ClearTextFunc) {
        result = a_ClearTextFunc(result, a_ElementIndex);
    }
    if (a_OutTag && a_TextAlign && !IsBlankHtml(result)) {
        result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
    }
    return result;
}
// Builds the function that turns one found element into its final HTML.
//   a_OutTag                    - wrapper tag for the element's content.
//   a_TextAlign                 - text-align value for the wrapper.
//   a_ClearTextFunc             - optional text cleaner forwarded to GetClearHtml.
//   a_SubElementCheckerToRemove - optional remover forwarded to GetClearHtml.
// The returned function(a_Element, a_ElementIndex):
//   - uses 'h2' instead of a_OutTag when the element has a class token starting
//     with 'Title_title';
//   - wraps the result in <blockquote> when the element's data-type is 'quote'.
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc, a_SubElementCheckerToRemove) {
    return function FinishWorkFunc(a_Element, a_ElementIndex) {
        const is_subtitle = a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element);
        const html = GetClearHtml(
            a_Element,
            is_subtitle ? 'h2' : a_OutTag,
            a_TextAlign,
            a_SubElementCheckerToRemove,
            a_ClearTextFunc,
            a_ElementIndex
        );
        const is_quote = a_Element && a_Element.dataset && a_Element.dataset.type == 'quote';
        return is_quote ? '<blockquote>' + html + '</blockquote>' : html;
    };
}
// Finisher used for zakonvremeni.ru: returns the element's plain text as produced
// by the function that GrubTextFuncTemplate() builds.
//   a_Element      - element to read.
//   a_ElementIndex - ignored.
function FinishWorkFuncZV(a_Element, a_ElementIndex) {
    const grub_text = GrubTextFuncTemplate();
    return grub_text(a_Element);
}
// Создание контента для стандартных новостей
// Tag that wraps article titles in the generated markup.
const title_tag = 'h2';
// Finisher for title elements: built with the title tag and 'center' alignment,
// without a text cleaner and without a sub-element remover.
const title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center');
// Assembles the markup of a standard news page: title, then image, then text.
//   a_BaseElementTitle / a_BaseElementImage / a_BaseElementText
//       - roots under which the title, image and text elements are searched.
//   a_TitleRegExpElementPattern / a_ImageRegExpElementPattern / a_TextRegExpElementPattern
//       - class-name patterns for the respective elements.
//   a_ElementChecker            - function(element) -> truthy to keep a text element.
//   a_SubElementCheckerToRemove - function(element) -> true to drop a sub-element.
//   a_ClearTextPatterns         - markers handed to ClearTextFuncTemplate with the
//                                 "first element only" flag set.
// Returns the concatenated HTML: centred title(s), centred image, justified paragraphs.
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_SubElementCheckerToRemove, a_ClearTextPatterns) {
    const grub_func = GrubTextFuncTemplate();
    const lead_cleaner = ClearTextFuncTemplate(a_ClearTextPatterns, true);
    const paragraph_finish_text_func = FinishWorkFuncTemplate('p', 'justify', lead_cleaner, a_SubElementCheckerToRemove);

    const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
    const title_html = GetContentInContainers(title_elements, grub_func, title_finish_text_func);

    const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
    const image_html = GetImageInContainers(image_elements, 'center');

    const text_candidates = FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText);
    const text_elements = FIlterElements(text_candidates, a_ElementChecker);
    const text_html = GetContentInContainers(text_elements, grub_func, paragraph_finish_text_func);

    return '' + title_html + image_html + text_html;
}
// Создание контента для сайта
// Builds the markup shown to the user for the current page.
// Picks a site-specific configuration by location.hostname, extracts the article
// with MakeContentByNews (or, for zakonvremeni.ru, prepares plain text for
// cross-posting) and puts the result into a <textarea>.
// Returns the <textarea> HTML, or '' when the host is unknown or nothing was extracted.
function MakeContent() {
    // Text placed between <textarea> tags is still scanned for character references
    // and for the closing tag, so "&" and "<" are escaped to make the field show the
    // content exactly as it was generated.
    function EscapeForTextarea(a_Text) {
        return a_Text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
    }

    let content = '';
    let source_add = true;
    let host_name = null;

    if (location.hostname === 'tass.ru') {
        // test: https://tass.ru/proisshestviya/19117971
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
            'Image_wrapper_.*',
            '(Paragraph_paragraph|Title_title).*',
            ElementCheckerTrue,
            ElementCheckerFalse,
            ['/ТАСС/. ']
        );
    }
    else if (location.hostname === 'ria.ru') {
        // test: https://ria.ru/20231020/ssha-1904210900.html
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            ElementCheckerFalse,
            // The first block is cut at the first closing </strong>
            // (presumably the end of the dateline - verify against the site markup).
            ['</strong>']
        );
    }
    else if (location.hostname === 'rg.ru') {
        // test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            document,
            document,
            base_element_text,
            '.*Content_title.*',
            '.*(Content_image|RgPhotoreportClassic).*',
            '(PageContentCommonStyling_text|.*Content_lead).*',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'portal|rg-incut|article-img|Section',
                'RG-VIDEO|RG-INCUT|RG-PHOTOREPORT'
            ),
            []
        );
    }
    else if (location.hostname === 'russian.rt.com') {
        // test: https://russian.rt.com/business/article/1222163-centrobank-stavka-oktyabr-2023
        const base_element = document.getElementsByClassName('article article_article-page')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__heading',
            'article__cover article__cover_article-page',
            'article__text',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'read-more|article__cover'
            ),
            []
        );
        if (content.length === 0) {
            // Second page layout, labelled 'inotv' in the source link.
            // test: https://russian.rt.com/inotv/2023-10-27/DELFI-Latviya-budet-konfiskovivat-mashini
            const page_column = document.getElementsByClassName('left-column page')[0];
            const base_element_title = page_column && page_column.getElementsByTagName("h1")[0];
            // Pages without this layout (e.g. section listings) have no such heading;
            // leave the content empty instead of failing on a missing element.
            if (base_element_title) {
                const base_element_image = document.getElementsByTagName("figure")[0];
                const inotv_element_text = document.getElementsByTagName("article")[0];
                content = title_finish_text_func(base_element_title) +
                    MakeContentByNews(
                        base_element_title,
                        base_element_image,
                        inotv_element_text,
                        '!!!!', // the title is taken from the <h1> above instead
                        '.*',
                        'article-intro|article-body',
                        ElementCheckerTrue,
                        SubElementCheckerToRemoveTemplate(
                            'meta',
                            'IMG'
                        ),
                        []
                    );
                if (content.length > 0) {
                    host_name = 'inotv';
                }
            }
        }
    }
    else if (location.hostname === 'www.cnews.ru') {
        // test: https://www.cnews.ru/news/top/2023-10-27_rossiyane_sozdali_polnotsennyj
        // NOTE(review): 'news_containere' differs from the 'news_container' pattern used
        // below - possibly a typo; when no such element exists the search falls back to
        // the whole document. Confirm before changing.
        const base_element = document.getElementsByClassName('news_containere')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            document,
            '!!!', // no title
            'img-block',
            'news_container',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'article-top-author|article-menu_base|d-flex|img-block|NewsBodyLeftInclude|mobile-zone|other-news-note|cnLike|article-bottom-info|banner|comments_all',
                'NOINDEX|BR'
            ),
            []
        );
    }
    else if (location.hostname === 'mixednews.ru') {
        // test: https://mixednews.ru/archives/180224
        const base_element = document.getElementsByClassName('entry-header')[0];
        content = MakeContentByNews(
            base_element,
            document,
            document,
            'entry-title',
            'entry-content',
            'entry-content',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'ssba',
                'NOSCRIPT|SCRIPT|BR|IMG|!--'
            ),
            []
        );
    }
    else if (location.hostname === 'zakonvremeni.ru') {
        // Plain-text summary for cross-posting: title, categories, the article text up to
        // and including its first '.', and the page URL.
        const base_element = document.getElementsByClassName('item-page')[0];
        const article_body = base_element && base_element.querySelector('[itemprop=articleBody]');
        // Non-article pages have no such elements; leave the content empty there.
        if (article_body) {
            const grub_text_func = GrubTextFuncTemplate();
            const title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func, FinishWorkFuncZV);
            const parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func, FinishWorkFuncZV);
            const category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func, FinishWorkFuncZV);
            const page = RemoveAfterSplitter(TrimString(article_body.textContent), '.', true);
            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
        }
        source_add = false;
    }

    if (content.length === 0) {
        return '';
    }
    let textarea_text = content;
    if (source_add) {
        textarea_text += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + (host_name || location.hostname) + '</a></p>';
    }
    return '<textarea id = "news_content" rows="10" cols="100">' + EscapeForTextarea(textarea_text) + '</textarea>';
}
// Entry point: build the parsed markup once and show it at the very top of the
// page inside a centred 800px-wide box.
const content = MakeContent();
const panel = document.createElement("div");
panel.innerHTML = `<div style="margin: 0pt auto; width: 800px; text-align: center;">${content}</div>`;
document.body.insertBefore(panel, document.body.firstChild);
})();