You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
520 lines
20 KiB
520 lines
20 KiB
// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.3.5
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://rg.ru/*
// @match https://www.cnews.ru/*
// @match https://mixednews.ru/*
// @match https://russian.rt.com/*
// @match https://iz.ru/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
|
|
|
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru> |
|
|
|
(function() { |
|
'use strict'; |
|
|
|
// Поиск элементов по регулярному выражению |
|
|
|
/**
 * Returns the class list of an element as a plain string.
 *
 * HTML elements expose `className` as a string, but SVG elements expose an
 * object with `baseVal`/`animVal` fields instead. Passing that object to a
 * regular expression would test the text "[object SVGAnimatedString]", so the
 * `baseVal` string is returned for such elements.
 *
 * @param {Object} a_Element - Element (or element-like object) to inspect.
 * @returns {string} The class string, or '' when none is available.
 */
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
|
|
|
/**
 * Name accessor used with CheckRegExp: yields the element's `nodeName`.
 *
 * @param {Object} a_Element - Element to inspect.
 * @returns {*} Whatever the element stores in `nodeName`.
 */
function GetNodeName(a_Element) {
    const { nodeName } = a_Element;
    return nodeName;
}
|
|
|
/**
 * Tests whether the name extracted from an element contains a
 * whitespace-delimited token matching the given pattern.
 *
 * The pattern is anchored so that it must start at the beginning of the name
 * or after whitespace, and must not be followed by a non-whitespace character.
 *
 * @param {Function} a_GetElementNameFunc - Extracts the text to test from the element.
 * @param {string} a_RegExpPattern - Regular-expression source for one token.
 * @param {Object} a_Element - Element handed to the extractor.
 * @returns {boolean} True when the extracted text matches.
 */
function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
    const token_matcher = new RegExp(`(?:^|\\s)${a_RegExpPattern}(?!\\S)`);
    const tested_text = a_GetElementNameFunc(a_Element);
    return token_matcher.test(tested_text);
}
|
|
|
/**
 * Collects every descendant of a parent whose extracted name matches a pattern.
 *
 * @param {Function} a_GetElementNameFunc - Name extractor passed on to CheckRegExp.
 * @param {string} a_RegExpPattern - Pattern passed on to CheckRegExp.
 * @param {Object} [a_ElementParent] - Search root; `document` is used when it is falsy.
 * @returns {Array} Matching descendants, in the order getElementsByTagName('*') lists them.
 */
function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const search_root = a_ElementParent || document;
    const all_descendants = Array.from(search_root.getElementsByTagName('*'));
    return all_descendants.filter(
        (candidate) => CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, candidate)
    );
}
|
|
|
// Работа со строками |
|
|
|
/**
 * Strips leading and trailing whitespace from a string.
 *
 * @param {string} [str] - Text to trim; any falsy value is treated as ''.
 * @returns {string} The trimmed text.
 */
function TrimString(str) {
    const text = str || '';
    return text.trim();
}
|
|
|
/**
 * Drops everything up to and including the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @returns {string} The text after the marker, or the input unchanged when the marker is absent.
 */
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const position = a_String.indexOf(a_Splitter);
    return position === -1
        ? a_String
        : a_String.substring(position + a_Splitter.length);
}
|
|
|
/**
 * Cuts a string at the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @param {boolean} [a_SaveSplitter] - When truthy the marker itself is kept at the end of the result.
 * @returns {string} The text before the marker (plus the marker if requested),
 *     or the input unchanged when the marker is absent.
 */
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const position = a_String.indexOf(a_Splitter);
    if (position === -1) {
        return a_String;
    }
    const cut_at = a_SaveSplitter ? position + a_Splitter.length : position;
    return a_String.substring(0, cut_at);
}
|
|
|
/**
 * Removes the query part of a URL: everything from the first '?' onwards.
 *
 * @param {string} a_Url - URL text.
 * @returns {string} The URL without its query string; unchanged when there is no '?'.
 */
function ClearUrl(a_Url) {
    const [without_query] = a_Url.split('?');
    return without_query;
}
|
|
|
/**
 * Removes a leading "www." host label from a host name or URL.
 *
 * Only a "www." at the very start (optionally after a "scheme://" prefix) is
 * removed. A plain substring replacement would also cut "www." out of the
 * middle of a name such as "newwww.example.com".
 *
 * @param {string} [a_Url] - Host name or URL; any falsy value is treated as ''.
 * @returns {string} The text without the leading "www." label.
 */
function ClearWWW(a_Url) {
    const leading_www = /^([a-z][a-z0-9+.-]*:\/\/)?www\./i;
    return (a_Url || '').replace(leading_www, '$1');
}
|
|
|
/**
 * Builds a text cleaner that strips lead-in markers from content.
 *
 * @param {string[]} a_RemoveBeforeList - Markers; each one is applied in turn
 *     through RemoveBeforeSplitter, so text up to and including it is dropped.
 * @param {boolean} [a_OnlyFirstIndex] - When truthy, content whose element index
 *     is truthy and not equal to 1 is returned untouched.
 * @returns {Function} Cleaner taking (a_Content, a_ElementIndex) and returning the cleaned content.
 */
function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
    return function ClearTextFunc(a_Content, a_ElementIndex) {
        const is_later_element = a_ElementIndex && a_ElementIndex != 1;
        if (a_OnlyFirstIndex && is_later_element) {
            return a_Content;
        }
        return a_RemoveBeforeList.reduce(
            (remaining, marker) => RemoveBeforeSplitter(remaining, marker),
            a_Content
        );
    };
}
|
|
|
// Работа с контейнерами |
|
|
|
/**
 * Finds an image in the given containers and renders it as an HTML paragraph.
 *
 * Containers are scanned in order. Within a container, the container itself
 * and then all of its descendants are checked, and the last matching IMG wins.
 * Scanning stops at the first container that yields an image.
 *
 * An image matches when its `src` is an http(s) URL containing a ".png",
 * ".jpg" or ".jpeg" extension (case-insensitive). The pattern is a regex
 * literal so that the dot before the extension is matched literally.
 *
 * @param {Array} a_Elements - Container elements to scan; a nullish value is treated as empty.
 * @param {string} a_TextAlign - Value placed into the paragraph's text-align style.
 * @returns {string} `<p><img/></p>` markup with the query string removed from
 *     the image URL, or '' when no image was found.
 */
function GetImageInContainers(a_Elements, a_TextAlign) {
    const image_url_pattern = /https?:\/\/.*\.(?:png|jpe?g)/i;
    const IsMatchingImage = (node) => node.nodeName == 'IMG' && image_url_pattern.test(node.src);

    let img_src = '';
    for (const container of Array.from(a_Elements || [])) {
        if (IsMatchingImage(container)) {
            img_src = container.src;
        }
        for (const descendant of Array.from(container.querySelectorAll('*'))) {
            if (IsMatchingImage(descendant)) {
                img_src = descendant.src;
            }
        }
        if (img_src.length > 0) {
            break;
        }
    }

    if (img_src.length === 0) {
        return '';
    }
    return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + ClearUrl(img_src) + '" width = "600px"/></p>';
}
|
|
|
/**
 * Concatenates the output of a finishing function over a list of elements.
 *
 * The list is iterated by index, so array-likes (anything with numeric keys
 * and `length`) work and no non-index keys are ever passed to the callback.
 *
 * @param {Array} a_Elements - Elements to process; a nullish value yields ''.
 * @param {Function} a_GrubTextFunc - Unused; kept so existing call sites stay valid.
 * @param {Function} a_FinishWorkFunc - Called as (element, index) with a 1-based
 *     index; its return value is appended to the result.
 * @returns {string} All callback results joined in element order.
 */
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    let result = '';
    Array.from(a_Elements || []).forEach((element, position) => {
        result += a_FinishWorkFunc(element, position + 1);
    });
    return result;
}
|
|
|
// Фильтрация элементов |
|
|
|
/**
 * Returns the elements accepted by a checker, preserving their order.
 *
 * @param {Array} a_Elements - Indexed list of elements.
 * @param {Function} a_ElementChecker - Called with one element; a truthy result keeps it.
 * @returns {Array} New array with the accepted elements.
 */
function FIlterElements(a_Elements, a_ElementChecker) {
    return Array.from(a_Elements).filter((candidate) => a_ElementChecker(candidate));
}
|
|
|
/**
 * Checker that accepts every element.
 *
 * @param {Object} a_Element - Ignored.
 * @returns {boolean} Always true.
 */
function ElementCheckerTrue(a_Element) {
    const accepted = true;
    return accepted;
}
|
|
|
/**
 * Checker that rejects every element.
 *
 * @param {Object} a_Element - Ignored.
 * @returns {boolean} Always false.
 */
function ElementCheckerFalse(a_Element) {
    const accepted = false;
    return accepted;
}
|
|
|
/**
 * Checker for ria.ru article blocks: rejects blocks whose `data-type` marks
 * them as one of the listed non-text kinds.
 *
 * @param {Object} a_Element - Element with a `dataset` object.
 * @returns {boolean} False for the listed block types, true otherwise.
 */
function ElementCheckerRia(a_Element) {
    const rejected_types = ['article', 'banner', 'media', 'video', 'photolenta'];
    const is_rejected = rejected_types.some((type) => a_Element.dataset.type == type);
    return !is_rejected;
}
|
|
|
/**
 * Checker that accepts only elements marked with itemprop="articleBody".
 *
 * DOM elements do not expose `itemprop` as a property, so the attribute is
 * read through getAttribute(). A plain `itemprop` property is still honoured
 * for element-like objects that carry one.
 *
 * @param {Object} a_Element - Element or element-like object.
 * @returns {boolean} True when the element is the article body.
 */
function ElementCheckerZV(a_Element) {
    if (a_Element.itemprop == 'articleBody') {
        return true;
    }
    if (typeof a_Element.getAttribute !== 'function') {
        return false;
    }
    return a_Element.getAttribute('itemprop') === 'articleBody';
}
|
|
|
/**
 * Builds a checker that decides whether a sub-element should be removed.
 *
 * @param {string} a_Classes - Regex alternation of class-name fragments; an
 *     element whose class string contains any of them is marked for removal.
 * @param {string} [a_NodeNames] - Regex alternation of node names; an element
 *     whose nodeName is one of them is marked for removal. Skipped when falsy.
 * @returns {Function} Checker taking an element and returning a boolean. It
 *     returns false for a missing element or one without a parent node.
 */
function SubElementCheckerToRemoveTemplate(a_Classes, a_NodeNames) {
    return function SubElementCheckerToRemove(a_Element) {
        const is_attached = Boolean(a_Element && a_Element.parentNode);
        if (!is_attached) {
            return false;
        }
        const class_pattern = '.*(' + a_Classes + ').*';
        if (CheckRegExp(GetElementClassName, class_pattern, a_Element)) {
            return true;
        }
        if (!a_NodeNames) {
            return false;
        }
        return CheckRegExp(GetNodeName, '(' + a_NodeNames + ')', a_Element) ? true : false;
    };
}
|
|
|
// Обработка элементов |
|
|
|
/**
 * Builds a plain-text extractor for elements.
 *
 * @returns {Function} Extractor that returns the trimmed `textContent` of an
 *     element when its `innerText` is truthy, and '' otherwise.
 */
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        // innerText only gates the extraction; the returned text comes from textContent.
        return a_Element.innerText ? TrimString(a_Element.textContent) : '';
    };
}
|
|
|
/**
 * Strips presentation and bookkeeping attributes from an element, in place.
 *
 * The element is edited in place rather than rebuilt through `outerHTML`, so
 * the node keeps its identity and its children. Callers that still hold
 * references to the element or its descendants therefore keep working on the
 * live tree.
 *
 * @param {Object} a_Element - Element exposing `attributes` and `removeAttribute`.
 * @param {string[]} [a_KeepAttributes] - Lower-case attribute names to preserve.
 *     Defaults to the attributes that carry link and image content. Pass []
 *     to remove every attribute.
 */
function RemoveAllAttributes(a_Element, a_KeepAttributes = ['href', 'src', 'alt', 'title']) {
    // Snapshot the names first: `attributes` is read once so removals cannot skip entries.
    const attribute_names = Array.from(a_Element.attributes, (attribute) => attribute.name);
    attribute_names.forEach((name) => {
        if (!a_KeepAttributes.includes(name.toLowerCase())) {
            a_Element.removeAttribute(name);
        }
    });
}
|
|
|
/**
 * Unwraps an element: moves its children in front of it, then removes it.
 *
 * @param {Object} a_Element - Element to unwrap.
 * @param {Object} [a_Parent] - Parent to operate on when the element reports no `parentNode`.
 */
function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
    const container = a_Element.parentNode || a_Parent;
    // Each insertBefore detaches the child from a_Element, so firstChild advances on its own.
    for (let child = a_Element.firstChild; child; child = a_Element.firstChild) {
        container.insertBefore(child, a_Element);
    }
    container.removeChild(a_Element);
}
|
|
|
/**
 * Removes HTML comments from markup.
 *
 * Handles single-line comments, comments spanning several lines, and an
 * unterminated comment running to the end of the text.
 *
 * @param {string} [a_String] - Markup; any falsy value is treated as ''.
 * @returns {string} Markup with the comments removed.
 */
function RemoveCommentsHTML(a_String){
    const markup = a_String || '';
    const comment_pattern = /(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)/g;
    return markup.replace(comment_pattern, '');
}
|
|
|
/**
 * Produces cleaned-up HTML for one content element.
 *
 * Works on a deep clone, so the page itself is not modified. Steps, in order:
 *   1. Every descendant is either removed (when the remove-checker accepts it)
 *      or passed to RemoveAllAttributes and stripped of `class` and `id`.
 *   2. div/span/em/svg/path wrappers are unwrapped, keeping their children.
 *   3. Links whose host equals the current page's hostname are unwrapped.
 *   4. p/h2/li elements get the requested text alignment; blank ones are removed.
 *   5. HTML comments are removed, the optional text cleaner is applied, and a
 *      non-blank result is wrapped in the output tag.
 *
 * @param {Object} a_Element - Source element.
 * @param {string} [a_OutTag] - Tag to wrap the result in.
 * @param {string} [a_TextAlign] - text-align value for inner blocks and the wrapper.
 * @param {Function} [a_SubElementCheckerToRemove] - Returns truthy for descendants to drop.
 * @param {Function} [a_ClearTextFunc] - Called as (html, a_ElementIndex) to post-process the markup.
 * @param {number} [a_ElementIndex] - Position of the element, forwarded to the cleaner.
 * @returns {string} The cleaned markup. It is wrapped only when both the tag
 *     and the alignment are given and the markup is not blank.
 */
function GetClearHtml(a_Element, a_OutTag, a_TextAlign, a_SubElementCheckerToRemove, a_ClearTextFunc, a_ElementIndex) {
    const IsBlankHtml = (html) => TrimString(html).replace(' ', '') == '';
    const working_copy = a_Element.cloneNode(true);

    // Step 1: drop unwanted descendants, strip attributes from the rest.
    for (const node of working_copy.querySelectorAll('*')) {
        const must_drop = a_SubElementCheckerToRemove && a_SubElementCheckerToRemove(node);
        if (must_drop) {
            node.parentNode.removeChild(node);
            continue;
        }
        RemoveAllAttributes(node);
        node.removeAttribute('class');
        node.removeAttribute('id');
    }

    // Step 2: unwrap purely structural tags. Each tag is re-queried after the previous one is done.
    for (const wrapper_tag of ['div', 'span', 'em', 'svg', 'path']) {
        for (const wrapper of working_copy.querySelectorAll(wrapper_tag)) {
            RemoveCurrentElementSaveChild(wrapper, working_copy);
        }
    }

    // Step 3: links pointing back at the current site are reduced to their text.
    for (const link of working_copy.querySelectorAll('a')) {
        if (link.host == location.hostname) {
            RemoveCurrentElementSaveChild(link, working_copy);
        }
    }

    // Step 4: align text blocks and discard the blank ones.
    for (const block_tag of ['p', 'h2', 'li']) {
        for (const block of working_copy.querySelectorAll(block_tag)) {
            block.style.textAlign = a_TextAlign;
            if (IsBlankHtml(block.innerHTML)) {
                block.parentNode.removeChild(block);
            }
        }
    }

    // Step 5: serialise, post-process and wrap.
    let result = TrimString(RemoveCommentsHTML(working_copy.innerHTML));
    if (a_ClearTextFunc) {
        result = a_ClearTextFunc(result, a_ElementIndex);
    }
    if (a_OutTag && a_TextAlign && !IsBlankHtml(result)) {
        result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
    }
    return result;
}
|
|
|
|
|
/**
 * Builds the function that turns one content element into output HTML.
 *
 * @param {string} [a_OutTag] - Default wrapper tag for the element's markup.
 * @param {string} [a_TextAlign] - text-align value forwarded to GetClearHtml.
 * @param {Function} [a_ClearTextFunc] - Text cleaner forwarded to GetClearHtml.
 * @param {Function} [a_SubElementCheckerToRemove] - Remove-checker forwarded to GetClearHtml.
 * @returns {Function} Finisher taking (a_Element, a_ElementIndex). Elements whose
 *     class matches 'Title_title.*' are wrapped in h2 instead of the default tag,
 *     and elements with data-type "quote" are wrapped in a blockquote.
 */
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc, a_SubElementCheckerToRemove) {
    return function FinishWorkFunc(a_Element, a_ElementIndex) {
        const is_subtitle = a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element);
        const wrapper_tag = is_subtitle ? 'h2' : a_OutTag;
        const html = GetClearHtml(a_Element, wrapper_tag, a_TextAlign, a_SubElementCheckerToRemove, a_ClearTextFunc, a_ElementIndex);

        const is_quote = a_Element && a_Element.dataset && a_Element.dataset.type == 'quote';
        return is_quote ? '<blockquote>' + html + '</blockquote>' : html;
    };
}
|
|
|
/**
 * Finisher for zakonvremeni.ru: returns the element's text as produced by a
 * freshly built GrubTextFuncTemplate extractor.
 *
 * @param {Object} a_Element - Element to read.
 * @param {number} [a_ElementIndex] - Ignored.
 * @returns {string} The extracted text.
 */
function FinishWorkFuncZV(a_Element, a_ElementIndex) {
    const extract_text = GrubTextFuncTemplate();
    return extract_text(a_Element);
}
|
|
|
// Создание контента для стандартных новостей |
|
// Tag that article titles are wrapped in.
const title_tag = 'h2';

// Shared finisher for article titles: title tag, centred, with no text
// cleaner and no remove-checker.
const title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center');
|
|
|
/**
 * Assembles the HTML for a standard news article: title, lead image, body text.
 *
 * Each part is located by matching class names against a pattern below its
 * own base element.
 *
 * @param {Object} a_BaseElementTitle - Search root for title elements.
 * @param {Object} a_BaseElementImage - Search root for image containers.
 * @param {Object} a_BaseElementText - Search root for text blocks.
 * @param {string} a_TitleRegExpElementPattern - Class pattern of title elements.
 * @param {string} a_ImageRegExpElementPattern - Class pattern of image containers.
 * @param {string} a_TextRegExpElementPattern - Class pattern of text blocks.
 * @param {Function} a_ElementChecker - Filter applied to the matched text blocks.
 * @param {Function} a_SubElementCheckerToRemove - Remove-checker for descendants of text blocks.
 * @param {string[]} a_ClearTextPatterns - Markers handed to ClearTextFuncTemplate,
 *     which is created with its only-first-index flag set.
 * @returns {string} Title markup, then image markup, then text markup, concatenated.
 */
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_SubElementCheckerToRemove, a_ClearTextPatterns) {
    const grub_func = GrubTextFuncTemplate();
    const FindByClass = (pattern, root) => FindElementsByRegExp(GetElementClassName, pattern, root);

    const lead_cleaner = ClearTextFuncTemplate(a_ClearTextPatterns, true);
    const paragraph_finish_text_func = FinishWorkFuncTemplate('p', 'justify', lead_cleaner, a_SubElementCheckerToRemove);

    const title_html = GetContentInContainers(
        FindByClass(a_TitleRegExpElementPattern, a_BaseElementTitle),
        grub_func,
        title_finish_text_func
    );
    const image_html = GetImageInContainers(
        FindByClass(a_ImageRegExpElementPattern, a_BaseElementImage),
        'center'
    );
    const text_blocks = FIlterElements(
        FindByClass(a_TextRegExpElementPattern, a_BaseElementText),
        a_ElementChecker
    );
    const text_html = GetContentInContainers(text_blocks, grub_func, paragraph_finish_text_func);

    return '' + title_html + image_html + text_html;
}
|
|
|
// Создание контента для сайта |
|
|
|
/**
 * Builds the markup of the copy-ready panel for the current page.
 *
 * The site is picked by `location.hostname`. For news sites the article is
 * converted to HTML through MakeContentByNews and a "source" link paragraph is
 * appended. For zakonvremeni.ru a plain-text summary (title, categories, first
 * sentence, URL) is produced instead, without the source link.
 *
 * Page nodes that a branch dereferences directly are checked before use, so a
 * page without the expected layout yields '' instead of throwing.
 *
 * @returns {string} A `<textarea id = "news_content">` element as an HTML
 *     string, or '' when nothing could be extracted.
 */
function MakeContent() {
    let content = '';
    let source_add = true;
    let host_name = null;
    const grub_text_func = GrubTextFuncTemplate();
    const host = location.hostname;

    if (host === 'tass.ru') {
        // test: https://tass.ru/proisshestviya/19117971
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
            'Image_wrapper_.*',
            '(Paragraph_paragraph|Title_title).*',
            ElementCheckerTrue,
            ElementCheckerFalse,
            ['/ТАСС/. ']
        );
    }
    else if (host === 'ria.ru') {
        // test: https://ria.ru/20231020/ssha-1904210900.html
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];

        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            ElementCheckerFalse,
            ['</strong>']
        );
    }
    else if (host === 'rg.ru') {
        // test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
        const base_element_text = document.getElementsByClassName('article__body')[0];

        content = MakeContentByNews(
            document,
            document,
            base_element_text,
            '.*Content_title.*',
            '.*(Content_image|RgPhotoreportClassic).*',
            '(PageContentCommonStyling_text|.*Content_lead).*',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'portal|rg-incut|article-img|Section',
                'RG-VIDEO|RG-INCUT|RG-PHOTOREPORT'
            ),
            []
        );
    }
    else if (host === 'russian.rt.com') {
        // test: https://russian.rt.com/business/article/1222163-centrobank-stavka-oktyabr-2023
        const base_element = document.getElementsByClassName('article article_article-page')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];

        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__heading',
            'article__cover article__cover_article-page',
            'article__text',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'read-more|article__cover'
            ),
            []
        );

        if (content.length === 0) {
            // Fallback layout.
            // test: https://russian.rt.com/inotv/2023-10-27/DELFI-Latviya-budet-konfiskovivat-mashini
            const page_column = document.getElementsByClassName('left-column page')[0];
            const inotv_title = page_column ? page_column.getElementsByTagName("h1")[0] : undefined;

            // Without the heading this is not the fallback layout either; leave the content empty.
            if (inotv_title) {
                const inotv_image = document.getElementsByTagName("figure")[0];
                const inotv_text = document.getElementsByTagName("article")[0];

                content = title_finish_text_func(inotv_title) +
                    MakeContentByNews(
                        inotv_title,
                        inotv_image,
                        inotv_text,
                        '!!!!', // the title is rendered above; this pattern is meant to match nothing
                        '.*',
                        'article-intro|article-body',
                        ElementCheckerTrue,
                        SubElementCheckerToRemoveTemplate(
                            'meta',
                            'IMG'
                        ),
                        []
                    );
                if (content.length > 0) {
                    host_name = 'inotv';
                }
            }
        }
    }
    else if (host === 'www.cnews.ru') {
        // test: https://www.cnews.ru/news/top/2023-10-27_rossiyane_sozdali_polnotsennyj
        // NOTE(review): 'news_containere' differs from the 'news_container' text pattern below — confirm the class name.
        const base_element = document.getElementsByClassName('news_containere')[0];

        content = MakeContentByNews(
            base_element,
            base_element,
            document,
            '!!!', // no title
            'img-block',
            'news_container',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'article-top-author|article-menu_base|d-flex|img-block|NewsBodyLeftInclude|mobile-zone|other-news-note|cnLike|article-bottom-info|banner|comments_all',
                'NOINDEX|BR'
            ),
            []
        );
    }
    else if (host === 'mixednews.ru') {
        // test: https://mixednews.ru/archives/180224
        const base_element = document.getElementsByClassName('entry-header')[0];

        content = MakeContentByNews(
            base_element,
            document,
            document,
            'entry-title',
            'entry-content',
            'entry-content',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'ssba',
                'NOSCRIPT|SCRIPT|BR|IMG|!--'
            ),
            []
        );
    }
    else if (host === 'iz.ru') {
        // test: https://iz.ru/1639291/2024-01-24/amerikanskii-esminetc-uss-john-finn-proshel-cherez-taivanskii-proliv
        const base_element = document.getElementById('block-purple-content');

        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            'm-t-10|top_big_img_article__info__inside__title',
            'big_photo__img|top_big_img_article__img',
            'text-article__inside',
            ElementCheckerTrue,
            SubElementCheckerToRemoveTemplate(
                'more_style_one|igi-player|share_bottom|recommendation-block|slider-block|layer-'
            ),
            []
        );
    }
    else if (host === 'zakonvremeni.ru') {
        const base_element = document.getElementsByClassName('item-page')[0];
        const article_body = base_element ? base_element.querySelector('[itemprop=articleBody]') : null;

        // Pages without an article body (listings, the home page) produce no panel.
        if (article_body) {
            const FindText = (class_pattern) => GetContentInContainers(
                FindElementsByRegExp(GetElementClassName, class_pattern, base_element),
                grub_text_func,
                FinishWorkFuncZV
            );
            const title = FindText('page-header');
            const parent_category = FindText('parent-category-name');
            const category = FindText('category-name');
            // Only the first sentence of the article is kept.
            const page = RemoveAfterSplitter(TrimString(article_body.textContent), '.', true);

            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
            source_add = false;
        }
    }

    let result = '';
    if (content.length > 0) {
        result = '<textarea id = "news_content" rows="10" cols="100">' + content;
        if (source_add) {
            result += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + ClearWWW(host_name || location.hostname) + '</a></p>';
        }
        result += '</textarea>';
    }
    return result;
}
|
|
|
// Build the panel and put it on the page.
let content = MakeContent();
let news_text = document.createElement("div");
news_text.innerHTML = '<div style="margin: 0pt auto; width: 800px; text-align: center;">' + content + '</div>';

// On iz.ru the panel goes inside the title node and the top panel is removed.
// On every other site it goes at the very top of <body>.
let base_child = null;
if (location.hostname == 'iz.ru') {
    const top_panel = document.getElementsByClassName('top-panel')[0];
    // The top panel is absent on some pages; removing it must not abort the script.
    if (top_panel && top_panel.parentNode) {
        top_panel.parentNode.removeChild(top_panel);
    }
    base_child = document.getElementsByClassName('m-t-10')[0]
        || document.getElementsByClassName('top_big_img_article__info__inside__title')[0]
        || null;
}

if (base_child) {
    base_child.appendChild(news_text);
}
else {
    // Default placement, also used on iz.ru when no title node was found.
    document.body.insertBefore(news_text, document.body.firstChild);
}
|
|
|
})();
|
|
|