|
|
|
@ -1,7 +1,7 @@
|
|
|
|
|
// ==UserScript==
|
|
|
|
|
// @name News parser
|
|
|
|
|
// @namespace http://zakonvremeni.ru
|
|
|
|
|
// @version 0.1
|
|
|
|
|
// @version 0.2
|
|
|
|
|
// @description Parse news
|
|
|
|
|
// @author AlexeiBv+mirocod@narod.ru
|
|
|
|
|
// @match https://tass.ru/*
|
|
|
|
@ -11,49 +11,32 @@
|
|
|
|
|
// @grant none
|
|
|
|
|
// ==/UserScript==
|
|
|
|
|
|
|
|
|
|
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_platform_bot@narod.ru>
|
|
|
|
|
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
|
|
|
|
|
|
|
|
|
|
(function() { |
|
|
|
|
'use strict'; |
|
|
|
|
|
|
|
|
|
function getByClass (className, parent) { |
|
|
|
|
parent || (parent=document); |
|
|
|
|
var descendants=parent.getElementsByTagName('*'), i=-1, e, result=[]; |
|
|
|
|
var re = new RegExp("(?:^|\\s)" + className + "(?!\\S)"); |
|
|
|
|
// Поиск элементов по регулярному выражению
|
|
|
|
|
|
|
|
|
|
/**
 * Name generator for FindElementsByRegExp: returns the element's class list as a string.
 *
 * HTML elements expose `className` as a plain string, but SVG elements expose an
 * SVGAnimatedString object; for those the current value is read from `baseVal`
 * so that the caller's regular expression is tested against real class names.
 *
 * @param {Element} a_Element - element to inspect.
 * @returns {string} the class attribute value, or '' when there is none.
 */
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
|
|
|
|
|
|
|
|
|
/**
 * Finds all descendants of a parent whose generated name matches a pattern.
 *
 * The pattern is embedded into a larger regular expression so that it must
 * start at the beginning of the name or after whitespace, and must not be
 * followed by a non-whitespace character (i.e. it matches whole
 * space-separated tokens unless the pattern itself consumes more).
 *
 * @param {function(Element): string} a_GenElementNameFunc - produces the string
 *        that is tested for each descendant (e.g. its class name).
 * @param {string} a_RegExpPattern - regular expression source for one token.
 * @param {Element|Document} [a_ElementParent] - search root; defaults to `document`.
 * @returns {Element[]} matching descendants, in document order.
 */
function FindElementsByRegExp(a_GenElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const parent = a_ElementParent || document;
    const descendants = parent.getElementsByTagName('*');
    const re = new RegExp("(?:^|\\s)" + a_RegExpPattern + "(?!\\S)");
    const result = [];
    for (let i = 0; i < descendants.length; i++) {
        const e = descendants[i];
        // Only the generated name is tested; the element's raw className is
        // deliberately not consulted here so other name generators can be used.
        if (re.test(a_GenElementNameFunc(e))) {
            result.push(e);
        }
    }
    return result;
}
|
|
|
|
|
|
|
|
|
function GetImageInContainers(baseClass, parent, textAlign) { |
|
|
|
|
var elems = getByClass(baseClass, parent); |
|
|
|
|
if (!elems) { |
|
|
|
|
return ''; |
|
|
|
|
} |
|
|
|
|
// Работа со строками
|
|
|
|
|
|
|
|
|
|
var i; |
|
|
|
|
var img_src = ''; |
|
|
|
|
var re = new RegExp("(https?:\/\/.*\.(?:png|jpg))"); |
|
|
|
|
for (i in elems) { |
|
|
|
|
var e = elems[i]; |
|
|
|
|
var children = e.querySelectorAll("*"); |
|
|
|
|
for(let i = 0; i < children.length; i++){ |
|
|
|
|
var c = children[i]; |
|
|
|
|
if (c.nodeName == 'IMG' && re.test(c.src)) { |
|
|
|
|
img_src = c.src; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (img_src.length > 0) { |
|
|
|
|
return '<p style = "text-align:' + textAlign + ';"><img src = "'+ img_src + '" width = "600px"/></p>'; |
|
|
|
|
} |
|
|
|
|
return ''; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/**
 * Removes leading and trailing whitespace from a string.
 *
 * @param {?string} s - text to trim; a falsy value (null, undefined, '') is
 *        treated as the empty string.
 * @returns {string} the trimmed text.
 */
function TrimString(s) {
    return (s || '').trim();
}

/**
 * Previous name of TrimString, kept so that callers using the old name keep working.
 *
 * @param {?string} s - text to trim.
 * @returns {string} the trimmed text.
 */
function Trim(s) {
    return TrimString(s);
}
|
|
|
|
|
|
|
|
@ -77,8 +60,11 @@
|
|
|
|
|
return a_String; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign) { |
|
|
|
|
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) { |
|
|
|
|
function FinishWorkFunc(a_Content, a_Element) { |
|
|
|
|
if (a_ClearTextFunc) { |
|
|
|
|
a_Content = a_ClearTextFunc(a_Content); |
|
|
|
|
} |
|
|
|
|
if (a_OutTag && a_TextAlign) { |
|
|
|
|
return '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>'; |
|
|
|
|
} |
|
|
|
@ -89,18 +75,49 @@
|
|
|
|
|
return FinishWorkFunc |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
function GetContentInContainers(a_FinishWorkFunc, a_GrubTextFunc, baseClass, parent, a_ElementFilterFunc, a_ClearTextFunc) { |
|
|
|
|
var elems = getByClass(baseClass, parent); |
|
|
|
|
if (!elems) { |
|
|
|
|
return 'Не удалось найти ' + baseClass; |
|
|
|
|
/**
 * Cleans a URL by delegating to RemoveAfterSplitter with '?' as the splitter.
 *
 * NOTE(review): presumably this drops the query string; the exact result and
 * the meaning of the third argument (`false`) are defined by
 * RemoveAfterSplitter, which is declared elsewhere in this script.
 *
 * @param {string} a_Url - URL to clean.
 * @returns {*} whatever RemoveAfterSplitter returns for the given URL.
 */
function ClearUrl(a_Url) {
    return RemoveAfterSplitter(a_Url, '?', false);
}
|
|
|
|
|
|
|
|
|
/**
 * Builds a text-cleaning function from a list of splitters.
 *
 * The returned function passes the text through RemoveBeforeSplitter once per
 * splitter, in list order, feeding each result into the next call.
 *
 * @param {string[]} [a_RemoveBeforeList] - splitters to apply; when omitted or
 *        null the returned function leaves the text unchanged.
 * @returns {function(string): string} the cleaning function.
 */
function ClearTextFuncTemplate(a_RemoveBeforeList) {
    // A missing list used to throw on `.length` at call time; treat it as empty.
    const splitters = a_RemoveBeforeList || [];
    function ClearTextFunc(a_Content) {
        let content = a_Content;
        for (const splitter of splitters) {
            content = RemoveBeforeSplitter(content, splitter);
        }
        return content;
    }
    return ClearTextFunc;
}
|
|
|
|
|
|
|
|
|
var result = ''; |
|
|
|
|
for (var i in elems) { |
|
|
|
|
var e = elems[i]; |
|
|
|
|
if (a_ElementFilterFunc && !a_ElementFilterFunc(e)) { |
|
|
|
|
continue; |
|
|
|
|
// Работа с контейнерами
|
|
|
|
|
|
|
|
|
|
/**
 * Builds an HTML paragraph with the image found inside the given containers.
 *
 * Every <img> descendant of every container is inspected; an image qualifies
 * when its `src` is an http(s) URL containing a `.png` or `.jpg` extension.
 * When several images qualify, the last one encountered is used.
 *
 * @param {Element[]} a_Elements - containers to search.
 * @param {string} a_TextAlign - value for the paragraph's CSS `text-align`.
 * @returns {string} the paragraph markup, or '' when no image qualifies.
 */
function GetImageInContainers(a_Elements, a_TextAlign) {
    // A regexp literal keeps the escapes: built from a string literal, "\." and
    // "\/" lose their backslashes and the dot then matches any character.
    const image_url_re = /https?:\/\/.*\.(?:png|jpg)/;
    let img_src = '';
    for (const container of Array.from(a_Elements || [])) {
        if (!container || !container.querySelectorAll) {
            continue;
        }
        for (const img of container.querySelectorAll('img')) {
            if (image_url_re.test(img.src)) {
                img_src = img.src;
            }
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "'+ img_src + '" width = "600px"/></p>';
    }
    return '';
}
|
|
|
|
|
|
|
|
|
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) { |
|
|
|
|
var result = ''; |
|
|
|
|
for (var i in a_Elements) { |
|
|
|
|
var e = a_Elements[i]; |
|
|
|
|
|
|
|
|
|
var content = ''; |
|
|
|
|
if (e.querySelectorAll) { |
|
|
|
@ -115,9 +132,6 @@
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (a_ClearTextFunc) { |
|
|
|
|
content = a_ClearTextFunc(content); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if (a_FinishWorkFunc) { |
|
|
|
|
result += a_FinishWorkFunc(content, e); |
|
|
|
@ -129,76 +143,107 @@
|
|
|
|
|
return result; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
function ClearUrl(a_Url) { |
|
|
|
|
var separator = '?'; |
|
|
|
|
return RemoveAfterSplitter(a_Url, separator, false); |
|
|
|
|
// Фильтрация элементов
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the elements accepted by a checker function.
 *
 * @param {ArrayLike<*>} a_Elements - elements to examine (array or array-like);
 *        null/undefined is treated as an empty list.
 * @param {function(*): boolean} a_ElementChecker - called once per element with
 *        the element as its only argument; a truthy result keeps the element.
 * @returns {Array<*>} a new array with the accepted elements, in input order.
 */
function FIlterElements(a_Elements, a_ElementChecker) {
    // Wrap the checker so it never receives filter()'s index/array arguments.
    return Array.from(a_Elements || []).filter((element) => a_ElementChecker(element));
}
|
|
|
|
|
|
|
|
|
/**
 * Element checker that accepts every element.
 *
 * @param {*} a_Element - ignored.
 * @returns {boolean} always true.
 */
function ElementCheckerTrue(a_Element) {
    return true;
}

/**
 * Previous name of ElementCheckerTrue, kept so that callers using the old name keep working.
 *
 * @param {*} element - ignored.
 * @returns {boolean} always true.
 */
function FIlterTrue(element) {
    return ElementCheckerTrue(element);
}
|
|
|
|
|
|
|
|
|
/**
 * Element checker that accepts elements whose `data-type` is
 * 'text', 'quote' or 'list'.
 *
 * @param {HTMLElement} a_Element - element to check; elements without a
 *        `dataset` are rejected instead of raising an error.
 * @returns {boolean} true when the element's data type is one of the accepted values.
 */
function ElementCheckerRia(a_Element) {
    const type = a_Element.dataset ? a_Element.dataset.type : undefined;
    return type === 'text' || type === 'quote' || type === 'list';
}

/**
 * Previous name of ElementCheckerRia, kept so that callers using the old name keep working.
 *
 * @param {HTMLElement} element - element to check.
 * @returns {boolean} same result as ElementCheckerRia.
 */
function FIlterRia(element) {
    return ElementCheckerRia(element);
}
|
|
|
|
|
|
|
|
|
/**
 * Element checker that accepts elements marked with itemprop="articleBody".
 *
 * The microdata `itemprop` attribute is not exposed as an `itemprop` property
 * on DOM elements, so the attribute itself is read. A plain `itemprop`
 * property is still honoured for objects that carry one.
 *
 * @param {Element} a_Element - element to check.
 * @returns {boolean} true when the element's itemprop is 'articleBody'.
 */
function ElementCheckerZV(a_Element) {
    if (a_Element.itemprop === 'articleBody') {
        return true;
    }
    if (typeof a_Element.getAttribute === 'function') {
        return a_Element.getAttribute('itemprop') === 'articleBody';
    }
    return false;
}

/**
 * Previous name of ElementCheckerZV, kept so that callers using the old name keep working.
 *
 * @param {Element} element - element to check.
 * @returns {boolean} same result as ElementCheckerZV.
 */
function FIlterZV(element) {
    return ElementCheckerZV(element);
}
|
|
|
|
|
|
|
|
|
/**
 * Builds a text-cleaning function from a list of splitters.
 *
 * The returned function passes the text through RemoveBeforeSplitter once per
 * splitter, in list order, feeding each result into the next call.
 *
 * @param {string[]} [a_RemoveBeforeList] - splitters to apply; when omitted or
 *        null the returned function leaves the text unchanged.
 * @returns {function(string): string} the cleaning function.
 */
function ClearTextFuncTemplate(a_RemoveBeforeList) {
    // A missing list used to throw on `.length` at call time; treat it as empty.
    const splitters = a_RemoveBeforeList || [];
    function ClearTextFunc(a_Content) {
        let content = a_Content;
        for (const splitter of splitters) {
            content = RemoveBeforeSplitter(content, splitter);
        }
        return content;
    }
    return ClearTextFunc;
}
|
|
|
|
|
|
|
|
|
/**
 * Builds the function that extracts text from an element.
 *
 * The returned function yields the element's trimmed `textContent`, but only
 * when the element reports a non-empty `innerText`; otherwise it yields ''.
 *
 * @returns {function(Element): string} the text-extraction function.
 */
function GrubTextFuncTemplate() {
    function GrubTextFunc(a_Element) {
        // innerText is only used as the "has text" gate; the text itself
        // is taken from textContent.
        if (!a_Element.innerText) {
            return '';
        }
        return TrimString(a_Element.textContent);
    }
    return GrubTextFunc;
}
|
|
|
|
|
|
|
|
|
function MakeContent() { |
|
|
|
|
var content = ''; |
|
|
|
|
// Создание контента для стандартных новостей
|
|
|
|
|
|
|
|
|
|
/**
 * Assembles the output for a standard news page: title, image, then body text.
 *
 * Elements for each part are located with FindElementsByRegExp using
 * GetElementClassName as the name generator. The title is emitted as a centred
 * <h2>, the image is centred, and body elements that pass the checker are
 * emitted as justified <p> blocks after being cleaned with the given patterns.
 *
 * @param {Element} a_BaseElementTitle - search root for the title.
 * @param {Element} a_BaseElementImage - search root for the image.
 * @param {Element} a_BaseElementText - search root for the body text.
 * @param {string} a_TitleRegExpElementPattern - class-name pattern of title elements.
 * @param {string} a_ImageRegExpElementPattern - class-name pattern of image containers.
 * @param {string} a_TextRegExpElementPattern - class-name pattern of text elements.
 * @param {function(Element): boolean} a_ElementChecker - keeps only accepted text elements.
 * @param {string[]} a_ClearTextPatterns - splitters handed to ClearTextFuncTemplate.
 * @returns {string} the concatenated title, image and text markup.
 */
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_ClearTextPatterns) {
    const title_tag = 'h2';
    const p_tag = 'p';
    const title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center');
    const grub_text_func = GrubTextFuncTemplate();
    const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));

    let content = '';

    const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
    content += GetContentInContainers(title_elements, grub_text_func, title_finish_text_func);

    const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
    content += GetImageInContainers(image_elements, 'center');

    const text_elements = FIlterElements(
        FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText),
        a_ElementChecker
    );
    content += GetContentInContainers(text_elements, grub_text_func, paragraph_finish_text_func);

    return content;
}
|
|
|
|
|
|
|
|
|
// Создание контента для сайта
|
|
|
|
|
|
|
|
|
|
function MakeContent() { |
|
|
|
|
var content = ''; |
|
|
|
|
var source_add = true; |
|
|
|
|
var title_func = FinishWorkFuncTemplate(title_tag, 'center') |
|
|
|
|
var paragraph_func = FinishWorkFuncTemplate(p_tag, 'justify') |
|
|
|
|
var zero_tag_func = FinishWorkFuncTemplate() |
|
|
|
|
var grub_text_func = GrubTextFuncTemplate() |
|
|
|
|
|
|
|
|
|
if (location.hostname == 'tass.ru') { |
|
|
|
|
content += GetContentInContainers(title_func, GrubTextFuncTemplate(), 'tass_pkg_title--variant_h1_default.*', document.getElementById('content_box')); |
|
|
|
|
content += GetImageInContainers('Image_wrapper_.*', document.getElementById('content_box'), 'center'); |
|
|
|
|
content += GetContentInContainers(paragraph_func, GrubTextFuncTemplate(), 'Paragraph_paragraph.*', document.getElementById('content_box'), FIlterTrue, ClearTextFuncTemplate(['/ТАСС/. '])); |
|
|
|
|
let base_element = document.getElementById('content_box'); |
|
|
|
|
content = MakeContentByNews( |
|
|
|
|
base_element, |
|
|
|
|
base_element, |
|
|
|
|
base_element, |
|
|
|
|
'tass_pkg_title--variant_h1_default.*', |
|
|
|
|
'Image_wrapper_.*', |
|
|
|
|
'Paragraph_paragraph.*', |
|
|
|
|
ElementCheckerTrue, |
|
|
|
|
['/ТАСС/. '] |
|
|
|
|
); |
|
|
|
|
} |
|
|
|
|
else if (location.hostname == 'ria.ru') { |
|
|
|
|
content += GetContentInContainers(title_func, GrubTextFuncTemplate(), title_tag, document.getElementsByClassName('article__header')[0]); |
|
|
|
|
content += GetImageInContainers('photoview__open', document.getElementsByClassName('article__header')[0], 'center'); |
|
|
|
|
content += GetContentInContainers(paragraph_func, GrubTextFuncTemplate(), p_tag, 'article__block', document.getElementsByClassName('article__body')[0], 'justify', FIlterRia, ClearTextFuncTemplate(['– РИА Новости. ', '— РИА Новости. '])); |
|
|
|
|
let base_element = document.getElementsByClassName('article__header')[0]; |
|
|
|
|
var base_element_text = document.getElementsByClassName('article__body')[0]; |
|
|
|
|
content = MakeContentByNews( |
|
|
|
|
base_element, |
|
|
|
|
base_element, |
|
|
|
|
base_element_text, |
|
|
|
|
'article__title', |
|
|
|
|
'photoview__open', |
|
|
|
|
'article__block', |
|
|
|
|
ElementCheckerRia, |
|
|
|
|
['– РИА Новости. ', '— РИА Новости. '] |
|
|
|
|
); |
|
|
|
|
} |
|
|
|
|
else if (location.hostname == 'zakonvremeni.ru') { |
|
|
|
|
var title = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'page-header', document.getElementsByClassName('item-page')[0]); |
|
|
|
|
var parent_category = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'parent-category-name', document.getElementsByClassName('item-page')[0]); |
|
|
|
|
var category = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'category-name', document.getElementsByClassName('item-page')[0]); |
|
|
|
|
var page = RemoveAfterSplitter(Trim(document.getElementsByClassName('item-page')[0].querySelector('[itemprop=articleBody]').textContent), '.', true); |
|
|
|
|
let base_element = document.getElementsByClassName('item-page')[0]; |
|
|
|
|
var title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func); |
|
|
|
|
var parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func); |
|
|
|
|
var category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func); |
|
|
|
|
var page = RemoveAfterSplitter(TrimString(document.getElementsByClassName('item-page')[0].querySelector('[itemprop=articleBody]').textContent), '.', true); |
|
|
|
|
content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL; |
|
|
|
|
source_add = false; |
|
|
|
|
} |
|
|
|
|