// ==UserScript==
// @name         News parser
// @namespace    http://zakonvremeni.ru
// @version      0.2
// @description  Parse news
// @author       AlexeiBv+mirocod@narod.ru
// @match        https://tass.ru/*
// @match        https://ria.ru/*
// @match        https://rg.ru/*
// @match        https://russian.rt.com/*
// @match        https://zakonvremeni.ru/*
// @icon         https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant        none
// ==/UserScript==

// Public domain, 2023, Alexei Bezborodov (Алексей Безбородов)

/*
 * Extracts the title, lead image and body text of a news article from the
 * supported sites, converts them to simplified HTML and inserts the result
 * at the top of the page.
 *
 * NOTE(review): several string literals in the previous revision had lost
 * their HTML markup (they contained only raw line breaks). The markup emitted
 * below for the image paragraph, the quote wrapper, the source link and the
 * output <textarea> is a reconstruction — confirm against the intended output.
 */
(function() {
    'use strict';

    // ---------------------------------------------------------------------
    // Finding elements by regular expression
    // ---------------------------------------------------------------------

    /**
     * Returns the class string of an element.
     * SVG elements expose `className` as an object rather than a string, so
     * the `class` attribute is read instead in that case.
     */
    function GetElementClassName(a_Element) {
        const class_name = a_Element.className;
        if (typeof class_name === 'string') {
            return class_name;
        }
        return (a_Element.getAttribute && a_Element.getAttribute('class')) || '';
    }

    /** Returns the node name of an element (upper-case for HTML elements). */
    function GetNodeName(a_Element) {
        return a_Element.nodeName;
    }

    /**
     * Compiles a pattern that must match a whole whitespace-delimited token
     * prefix: it has to start at the beginning of the string or after
     * whitespace, and must not be followed by a non-space character.
     */
    function BuildNameRegExp(a_RegExpPattern) {
        return new RegExp('(?:^|\\s)' + a_RegExpPattern + '(?!\\S)');
    }

    /**
     * Tests the name produced by a_GetElementNameFunc(a_Element) against
     * a_RegExpPattern (see BuildNameRegExp for the anchoring rules).
     */
    function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
        return BuildNameRegExp(a_RegExpPattern).test(a_GetElementNameFunc(a_Element));
    }

    /**
     * Returns, in document order, every descendant of a_ElementParent whose
     * name (as produced by a_GetElementNameFunc) matches a_RegExpPattern.
     * When a_ElementParent is missing the whole document is searched.
     */
    function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
        const root = a_ElementParent || document;
        // Compile once instead of once per element.
        const re = BuildNameRegExp(a_RegExpPattern);
        return Array.from(root.getElementsByTagName('*')).filter(function (element) {
            return re.test(a_GetElementNameFunc(element));
        });
    }

    // ---------------------------------------------------------------------
    // String helpers
    // ---------------------------------------------------------------------

    /** Trims leading/trailing whitespace; a falsy argument yields ''. */
    function TrimString(s) {
        return (s || '').trim();
    }

    /**
     * Returns the part of a_String after the first occurrence of a_Splitter,
     * or a_String unchanged when the splitter is not found.
     */
    function RemoveBeforeSplitter(a_String, a_Splitter) {
        const index = a_String.indexOf(a_Splitter);
        if (index === -1) {
            return a_String;
        }
        return a_String.substring(index + a_Splitter.length);
    }

    /**
     * Returns the part of a_String before the first occurrence of a_Splitter.
     * The splitter itself is kept at the end when a_SaveSplitter is truthy.
     * Returns a_String unchanged when the splitter is not found.
     */
    function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
        const index = a_String.indexOf(a_Splitter);
        if (index === -1) {
            return a_String;
        }
        const kept_length = a_SaveSplitter ? a_Splitter.length : 0;
        return a_String.substring(0, index + kept_length);
    }

    /** Drops the query string (everything from the first '?') from a URL. */
    function ClearUrl(a_Url) {
        const separator = '?';
        return RemoveAfterSplitter(a_Url, separator, false);
    }

    /**
     * Builds a function that applies RemoveBeforeSplitter to its argument
     * once for every entry of a_RemoveBeforeList, in order.
     */
    function ClearTextFuncTemplate(a_RemoveBeforeList) {
        const splitters = a_RemoveBeforeList || [];
        function ClearTextFunc(a_Content) {
            return splitters.reduce(RemoveBeforeSplitterStep, a_Content);
        }
        function RemoveBeforeSplitterStep(content, splitter) {
            return RemoveBeforeSplitter(content, splitter);
        }
        return ClearTextFunc;
    }

    /** Escapes text so it can be placed in HTML text or a quoted attribute. */
    function EscapeHtml(a_Text) {
        return String(a_Text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;');
    }

    // ---------------------------------------------------------------------
    // Working with containers
    // ---------------------------------------------------------------------

    /**
     * Looks for <img> elements inside the given containers whose `src` is an
     * http(s) URL containing a .png or .jpg extension, and returns an aligned
     * paragraph with that image, or '' when none is found.
     * When several images match, the last one wins (same as the original loop).
     */
    function GetImageInContainers(a_Elements, a_TextAlign) {
        // Regex literal: the previous string-built pattern lost the escape on
        // the dot, so it matched any character before "png"/"jpg".
        const image_url_re = /https?:\/\/.*\.(?:png|jpg)/;
        let img_src = '';
        for (const container of a_Elements) {
            container.querySelectorAll('img').forEach(function (img) {
                if (image_url_re.test(img.src)) {
                    img_src = img.src;
                }
            });
        }
        if (img_src.length === 0) {
            return '';
        }
        // NOTE(review): reconstructed markup — the original literal was lost.
        return '<p style = "text-align:' + a_TextAlign + ';"><img src="' + EscapeHtml(img_src) + '" /></p>';
    }

    /**
     * Concatenates the output of the processing function for every element.
     * a_FinishWorkFunc is used when given; otherwise a_GrubTextFunc is used,
     * so two-argument calls (plain-text extraction) no longer throw.
     */
    function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
        const process = (typeof a_FinishWorkFunc === 'function') ? a_FinishWorkFunc : a_GrubTextFunc;
        let result = '';
        for (const element of a_Elements) {
            result += process(element);
        }
        return result;
    }

    // ---------------------------------------------------------------------
    // Filtering elements
    // ---------------------------------------------------------------------

    /** Returns the elements for which a_ElementChecker returns a truthy value. */
    function FilterElements(a_Elements, a_ElementChecker) {
        return Array.from(a_Elements).filter(function (element) {
            return a_ElementChecker(element);
        });
    }

    /** Accepts every element. */
    function ElementCheckerTrue(a_Element) {
        return true;
    }

    /** Rejects blocks whose data-type is 'article', 'banner' or 'media'. */
    function ElementCheckerRia(a_Element) {
        const type = a_Element.dataset ? a_Element.dataset.type : undefined;
        return !(type === 'article' || type === 'banner' || type === 'media');
    }

    /**
     * Accepts only elements marked itemprop="articleBody".
     * `itemprop` is read as an attribute; it is not available as a plain
     * element property.
     */
    function ElementCheckerZV(a_Element) {
        return a_Element.getAttribute('itemprop') === 'articleBody';
    }

    // ---------------------------------------------------------------------
    // Processing elements
    // ---------------------------------------------------------------------

    /** Builds a function returning the trimmed text content of an element. */
    function GrubTextFuncTemplate() {
        function GrubTextFunc(a_Element) {
            if (a_Element.innerText) {
                return TrimString(a_Element.textContent);
            }
            return '';
        }
        return GrubTextFunc;
    }

    /**
     * Removes the attributes of a_Element in place.
     * Attributes named in a_KeepAttributes (default: href, src, alt) are
     * preserved so that links and images stay usable after cleaning.
     */
    function RemoveAllAttributes(a_Element, a_KeepAttributes) {
        const keep = a_KeepAttributes || ['href', 'src', 'alt'];
        // Snapshot first: `attributes` is live and shrinks while removing.
        Array.from(a_Element.attributes).forEach(function (attribute) {
            if (keep.indexOf(attribute.name) === -1) {
                a_Element.removeAttribute(attribute.name);
            }
        });
    }

    /**
     * Replaces a_Element by its own children (unwraps it).
     * Does nothing when the element is not attached to a parent.
     * a_Parent is accepted for backward compatibility; the element's actual
     * parent is always the one that is modified.
     */
    function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
        const parent = a_Element.parentNode;
        if (!parent) {
            return;
        }
        while (a_Element.firstChild) {
            parent.insertBefore(a_Element.firstChild, a_Element);
        }
        parent.removeChild(a_Element);
    }

    /** True when an innerHTML string holds nothing but whitespace / &nbsp;. */
    function IsBlankHtml(a_Html) {
        return /^(?:\s|&nbsp;)*$/.test(a_Html);
    }

    /**
     * Produces simplified HTML for a copy of a_Element:
     *  - "read-more" / "article__cover" DIVs are dropped,
     *  - attributes are stripped (see RemoveAllAttributes),
     *  - div/span/em/svg/path wrappers and same-host links are unwrapped,
     *  - p/h2 get the requested text alignment, blank ones are removed.
     * a_ClearTextFunc (optional) is applied to the inner HTML BEFORE it is
     * wrapped, so removing a leading dateline cannot cut off the opening tag.
     * The result is wrapped in <a_OutTag> only when both a_OutTag and
     * a_TextAlign are given.
     */
    function GetClearHtml(a_Element, a_OutTag, a_TextAlign, a_ClearTextFunc) {
        const clear_element = a_Element.cloneNode(true);

        clear_element.querySelectorAll('*').forEach(function (element) {
            // The class has to be inspected before attributes are stripped.
            const is_unwanted_block = element.nodeName === 'DIV' &&
                CheckRegExp(GetElementClassName, '.*(read-more|article__cover).*', element);
            RemoveAllAttributes(element);
            if (is_unwanted_block && element.parentNode) {
                element.parentNode.removeChild(element);
            }
        });

        const tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
        tags_to_delete.forEach(function (del_tag_name) {
            clear_element.querySelectorAll(del_tag_name).forEach(function (element) {
                RemoveCurrentElementSaveChild(element, clear_element);
            });
        });

        // Links pointing to the current site are replaced by their text.
        clear_element.querySelectorAll('a').forEach(function (link) {
            if (link.hostname === location.hostname) {
                RemoveCurrentElementSaveChild(link, clear_element);
            }
        });

        const tags_to_align = ['p', 'h2'];
        tags_to_align.forEach(function (align_tag_name) {
            clear_element.querySelectorAll(align_tag_name).forEach(function (element) {
                if (a_TextAlign) {
                    element.style.textAlign = a_TextAlign;
                }
                if (IsBlankHtml(element.innerHTML) && element.parentNode) {
                    element.parentNode.removeChild(element);
                }
            });
        });

        let result = clear_element.innerHTML;
        if (a_ClearTextFunc) {
            result = a_ClearTextFunc(result);
        }
        if (a_OutTag && a_TextAlign) {
            result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
        }
        return result;
    }

    /**
     * Builds the per-element HTML producer used by GetContentInContainers.
     *  - Elements whose class matches "PageContentCommonStyling_text..." are
     *    treated as containers: each descendant <p> is processed separately.
     *  - Elements whose class matches "Title_title..." are emitted as <h2>.
     *  - Elements with data-type="quote" are wrapped as a quotation.
     */
    function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
        function FinishWorkFunc(a_Element) {
            if (!a_Element) {
                return '';
            }

            if (CheckRegExp(GetElementClassName, '(PageContentCommonStyling_text.*)', a_Element)) {
                let content = '';
                for (const paragraph of FindElementsByRegExp(GetNodeName, '(P)', a_Element)) {
                    content += FinishWorkFunc(paragraph);
                }
                return content;
            }

            const is_subtitle = CheckRegExp(GetElementClassName, 'Title_title.*', a_Element);
            const out_tag = is_subtitle ? 'h2' : a_OutTag;
            let content = GetClearHtml(a_Element, out_tag, a_TextAlign, a_ClearTextFunc);

            if (a_Element.dataset && a_Element.dataset.type === 'quote') {
                // NOTE(review): reconstructed wrapper — the original literal
                // had lost its tags; presumably a block-level quote element.
                content = '<blockquote>' + content + '</blockquote>';
            }
            return content;
        }
        return FinishWorkFunc;
    }

    // ---------------------------------------------------------------------
    // Building content for a standard news page
    // ---------------------------------------------------------------------

    /**
     * Assembles title + image + body HTML.
     * Each part is located by a class-name pattern below its own base
     * element; body blocks are additionally filtered by a_ElementChecker and
     * cleaned with the a_ClearTextPatterns "remove everything before" list.
     */
    function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText,
        a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern,
        a_ElementChecker, a_ClearTextPatterns) {
        const title_tag = 'h2';
        const p_tag = 'p';
        const grub_func = GrubTextFuncTemplate();
        const title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center');
        const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));

        const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
        const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
        const text_elements = FilterElements(
            FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText),
            a_ElementChecker);

        let content = '';
        content += GetContentInContainers(title_elements, grub_func, title_finish_text_func);
        content += GetImageInContainers(image_elements, 'center');
        content += GetContentInContainers(text_elements, grub_func, paragraph_finish_text_func);
        return content;
    }

    // ---------------------------------------------------------------------
    // Building content for the current site
    // ---------------------------------------------------------------------

    /**
     * Extracts the article for the current host and returns the markup of a
     * read-only <textarea> holding it, or '' when nothing was extracted.
     */
    function MakeContent() {
        let content = '';
        let source_add = true;
        const grub_text_func = GrubTextFuncTemplate();
        const hostname = location.hostname;

        if (hostname === 'tass.ru') {
            // test: https://tass.ru/proisshestviya/19117971
            const base_element = document.getElementById('content_box');
            content = MakeContentByNews(
                base_element,
                base_element,
                base_element,
                '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
                'Image_wrapper_.*',
                '(Paragraph_paragraph|Title_title).*',
                ElementCheckerTrue,
                // NOTE(review): the original literal ended in a raw line break
                // (likely lost markup); the plain dateline marker is used.
                ['/ТАСС/. ']
            );
        } else if (hostname === 'ria.ru') {
            // test: https://ria.ru/20231020/ssha-1904210900.html
            const base_element = document.getElementsByClassName('article__header')[0];
            const base_element_text = document.getElementsByClassName('article__body')[0];
            // Dateline may use any of these dash characters.
            const tire = ['-', '–', '—', '‒', '―', '⸺', '⸻'];
            // NOTE(review): these patterns were built but not passed on in the
            // previous revision (an empty splitter was used) — confirm that
            // dateline stripping is wanted for RIA.
            const clear_text = tire.map(function (t) {
                return t + ' РИА Новости. ';
            });
            content = MakeContentByNews(
                base_element,
                base_element,
                base_element_text,
                'article__title',
                'photoview__open',
                'article__block',
                ElementCheckerRia,
                clear_text
            );
        } else if (hostname === 'rg.ru') {
            const base_element_text = document.getElementsByClassName('article__body')[0];
            content = MakeContentByNews(
                document,
                document,
                base_element_text,
                'PageArticleContent_title.*',
                'PageArticleContent_image.*',
                '(PageContentCommonStyling_text|PageArticleContent_lead).*',
                ElementCheckerRia,
                []
            );
        } else if (hostname === 'russian.rt.com') {
            // test: https://russian.rt.com/business/article/1222163-centrobank-stavka-oktyabr-2023
            const base_element = document.getElementsByClassName('article article_article-page')[0];
            const base_element_text = document.getElementsByClassName('article__body')[0];
            content = MakeContentByNews(
                base_element,
                base_element,
                base_element_text,
                'article__heading',
                'article__cover article__cover_article-page',
                'article__text',
                ElementCheckerRia,
                []
            );
        } else if (hostname === 'zakonvremeni.ru') {
            const base_element = document.getElementsByClassName('item-page')[0];
            // Pages without an article (e.g. listings) have nothing to extract.
            if (base_element) {
                const title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func);
                const parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func);
                const category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func);
                const article_body = base_element.querySelector('[itemprop=articleBody]');
                // Only the text up to (and including) the first '.' is kept.
                const page = article_body ? RemoveAfterSplitter(TrimString(article_body.textContent), '.', true) : '';
                content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
                source_add = false;
            }
        }

        if (content.length === 0) {
            return '';
        }

        if (source_add) {
            // NOTE(review): reconstructed — the flag and ClearUrl were unused
            // in the previous revision; presumably a source link was appended.
            const source_url = EscapeHtml(ClearUrl(document.URL));
            content += '\n<p><a href="' + source_url + '">' + source_url + '</a></p>';
        }

        // The extracted HTML is shown as text, so it is escaped before being
        // embedded in the textarea markup.
        // NOTE(review): reconstructed container — confirm the intended markup.
        return '<textarea readonly rows="12" style="width:100%;">' + EscapeHtml(content) + '</textarea>';
    }

    const content = MakeContent();
    const logo = document.createElement('div');
    logo.innerHTML = '\n' + content + '\n';
    document.body.insertBefore(logo, document.body.firstChild);
})();