Browse Source

Версия 0.2 полностью обновлена. Функционал тот же.

master
Alexei 1 year ago
parent
commit
b8dc6c2792
  1. 207
      news_parser.js

207
news_parser.js

@ -1,7 +1,7 @@
// ==UserScript== // ==UserScript==
// @name News parser // @name News parser
// @namespace http://zakonvremeni.ru // @namespace http://zakonvremeni.ru
// @version 0.1 // @version 0.2
// @description Parse news // @description Parse news
// @author AlexeiBv+mirocod@narod.ru // @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/* // @match https://tass.ru/*
@ -11,49 +11,32 @@
// @grant none // @grant none
// ==/UserScript== // ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_platform_bot@narod.ru> // Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
(function() { (function() {
'use strict'; 'use strict';
function getByClass (className, parent) { // Поиск элементов по регулярному выражению
parent || (parent=document);
var descendants=parent.getElementsByTagName('*'), i=-1, e, result=[]; function GetElementClassName(a_Element) {
var re = new RegExp("(?:^|\\s)" + className + "(?!\\S)"); return a_Element.className;
}
function FindElementsByRegExp(a_GenElementNameFunc, a_RegExpPattern, a_ElementParent) {
a_ElementParent || (a_ElementParent=document);
var descendants = a_ElementParent.getElementsByTagName('*'), i=-1, e, result=[];
var re = new RegExp("(?:^|\\s)" + a_RegExpPattern + "(?!\\S)");
while (e=descendants[++i]) { while (e=descendants[++i]) {
if (re.test(e.className)){ if (re.test(a_GenElementNameFunc(e))){
result.push(e); result.push(e);
} }
} }
return result; return result;
} }
function GetImageInContainers(baseClass, parent, textAlign) { // Работа со строками
var elems = getByClass(baseClass, parent);
if (!elems) {
return '';
}
var i; function TrimString(s) {
var img_src = '';
var re = new RegExp("(https?:\/\/.*\.(?:png|jpg))");
for (i in elems) {
var e = elems[i];
var children = e.querySelectorAll("*");
for(let i = 0; i < children.length; i++){
var c = children[i];
if (c.nodeName == 'IMG' && re.test(c.src)) {
img_src = c.src;
}
}
}
if (img_src.length > 0) {
return '<p style = "text-align:' + textAlign + ';"><img src = "'+ img_src + '" width = "600px"/></p>';
}
return '';
}
function Trim(s) {
return ( s || '' ).replace( /^\s+|\s+$/g, '' ); return ( s || '' ).replace( /^\s+|\s+$/g, '' );
} }
@ -77,8 +60,11 @@
return a_String; return a_String;
} }
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign) { function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
function FinishWorkFunc(a_Content, a_Element) { function FinishWorkFunc(a_Content, a_Element) {
if (a_ClearTextFunc) {
a_Content = a_ClearTextFunc(a_Content);
}
if (a_OutTag && a_TextAlign) { if (a_OutTag && a_TextAlign) {
return '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>'; return '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>';
} }
@ -89,18 +75,49 @@
return FinishWorkFunc return FinishWorkFunc
} }
function GetContentInContainers(a_FinishWorkFunc, a_GrubTextFunc, baseClass, parent, a_ElementFilterFunc, a_ClearTextFunc) { function ClearUrl(a_Url) {
var elems = getByClass(baseClass, parent); var separator = '?';
if (!elems) { return RemoveAfterSplitter(a_Url, separator, false);
return 'Не удалось найти ' + baseClass;
} }
var result = ''; function ClearTextFuncTemplate(a_RemoveBeforeList) {
for (var i in elems) { function ClearTextFunc(a_Content) {
var e = elems[i]; var content = a_Content;
if (a_ElementFilterFunc && !a_ElementFilterFunc(e)) { for (let i = 0; i < a_RemoveBeforeList.length; i++) {
continue; var r = a_RemoveBeforeList[i];
content = RemoveBeforeSplitter(content, r);
}
return content;
}
return ClearTextFunc
}
// Работа с контейнерами
function GetImageInContainers(a_Elements, a_TextAlign) {
var i;
var img_src = '';
var re = new RegExp("(https?:\/\/.*\.(?:png|jpg))");
for (i in a_Elements) {
var e = a_Elements[i];
var children = e.querySelectorAll("*");
for(let i = 0; i < children.length; i++){
var c = children[i];
if (c.nodeName == 'IMG' && re.test(c.src)) {
img_src = c.src;
}
}
}
if (img_src.length > 0) {
return '<p style = "text-align:' + a_TextAlign + ';"><img src = "'+ img_src + '" width = "600px"/></p>';
} }
return '';
}
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
var result = '';
for (var i in a_Elements) {
var e = a_Elements[i];
var content = ''; var content = '';
if (e.querySelectorAll) { if (e.querySelectorAll) {
@ -115,9 +132,6 @@
} }
} }
} }
if (a_ClearTextFunc) {
content = a_ClearTextFunc(content);
}
if (a_FinishWorkFunc) { if (a_FinishWorkFunc) {
result += a_FinishWorkFunc(content, e); result += a_FinishWorkFunc(content, e);
@ -129,76 +143,107 @@
return result; return result;
} }
function ClearUrl(a_Url) { // Фильтрация элементов
var separator = '?';
return RemoveAfterSplitter(a_Url, separator, false); function FIlterElements(a_Elements, a_ElementChecker) {
var result = [];
for (let i = 0; i < a_Elements.length; i++) {
var e = a_Elements[i];
if (a_ElementChecker(e)) {
result.push(e);
}
} }
function FIlterTrue(element) { return result;
return true;
} }
function FIlterRia(element) { function ElementCheckerTrue(a_Element) {
if (element.dataset.type == 'text' || element.dataset.type == 'quote' || element.dataset.type == 'list') {
return true; return true;
} }
return false;
}
function FIlterZV(element) { function ElementCheckerRia(a_Element) {
if (element.itemprop == 'articleBody') { if (a_Element.dataset.type == 'text' || a_Element.dataset.type == 'quote' || a_Element.dataset.type == 'list') {
return true; return true;
} }
return false; return false;
} }
function ClearTextFuncTemplate(a_RemoveBeforeList) { function ElementCheckerZV(a_Element) {
function ClearTextFunc(a_Content) { if (a_Element.itemprop == 'articleBody') {
var content = a_Content; return true;
for (let i = 0; i < a_RemoveBeforeList.length; i++) {
var r = a_RemoveBeforeList[i];
content = RemoveBeforeSplitter(content, r);
}
return content;
} }
return ClearTextFunc return false;
} }
function GrubTextFuncTemplate() { function GrubTextFuncTemplate() {
function GrubTextFunc(a_Element) { function GrubTextFunc(a_Element) {
var content = ''; var content = '';
if (a_Element.innerText) { if (a_Element.innerText) {
content = Trim(a_Element.textContent); content = TrimString(a_Element.textContent);
} }
return content; return content;
} }
return GrubTextFunc return GrubTextFunc
} }
function MakeContent() { // Создание контента для стандартных новостей
var content = '';
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_ClearTextPatterns) {
var title_tag = 'h2'; var title_tag = 'h2';
var p_tag = 'p'; var p_tag = 'p';
var zero_tag = ''; var title_finish_text_func = FinishWorkFuncTemplate(title_tag, 'center')
var grub_text_func = GrubTextFuncTemplate()
var content = '';
var paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));
content += GetContentInContainers(FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle), grub_text_func, title_finish_text_func);
content += GetImageInContainers(FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage), 'center');
content += GetContentInContainers(FIlterElements(FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText), a_ElementChecker), grub_text_func, paragraph_finish_text_func);
return content;
}
// Создание контента для сайта
function MakeContent() {
var content = '';
var source_add = true; var source_add = true;
var title_func = FinishWorkFuncTemplate(title_tag, 'center')
var paragraph_func = FinishWorkFuncTemplate(p_tag, 'justify')
var zero_tag_func = FinishWorkFuncTemplate() var zero_tag_func = FinishWorkFuncTemplate()
var grub_text_func = GrubTextFuncTemplate()
if (location.hostname == 'tass.ru') { if (location.hostname == 'tass.ru') {
content += GetContentInContainers(title_func, GrubTextFuncTemplate(), 'tass_pkg_title--variant_h1_default.*', document.getElementById('content_box')); let base_element = document.getElementById('content_box');
content += GetImageInContainers('Image_wrapper_.*', document.getElementById('content_box'), 'center'); content = MakeContentByNews(
content += GetContentInContainers(paragraph_func, GrubTextFuncTemplate(), 'Paragraph_paragraph.*', document.getElementById('content_box'), FIlterTrue, ClearTextFuncTemplate(['/ТАСС/. '])); base_element,
base_element,
base_element,
'tass_pkg_title--variant_h1_default.*',
'Image_wrapper_.*',
'Paragraph_paragraph.*',
ElementCheckerTrue,
['/ТАСС/. ']
);
} }
else if (location.hostname == 'ria.ru') { else if (location.hostname == 'ria.ru') {
content += GetContentInContainers(title_func, GrubTextFuncTemplate(), title_tag, document.getElementsByClassName('article__header')[0]); let base_element = document.getElementsByClassName('article__header')[0];
content += GetImageInContainers('photoview__open', document.getElementsByClassName('article__header')[0], 'center'); var base_element_text = document.getElementsByClassName('article__body')[0];
content += GetContentInContainers(paragraph_func, GrubTextFuncTemplate(), p_tag, 'article__block', document.getElementsByClassName('article__body')[0], 'justify', FIlterRia, ClearTextFuncTemplate(['– РИА Новости. ', '— РИА Новости. '])); content = MakeContentByNews(
base_element,
base_element,
base_element_text,
'article__title',
'photoview__open',
'article__block',
ElementCheckerRia,
['– РИА Новости. ', '— РИА Новости. ']
);
} }
else if (location.hostname == 'zakonvremeni.ru') { else if (location.hostname == 'zakonvremeni.ru') {
var title = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'page-header', document.getElementsByClassName('item-page')[0]); let base_element = document.getElementsByClassName('item-page')[0];
var parent_category = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'parent-category-name', document.getElementsByClassName('item-page')[0]); var title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func);
var category = GetContentInContainers(zero_tag_func, GrubTextFuncTemplate(), 'category-name', document.getElementsByClassName('item-page')[0]); var parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func);
var page = RemoveAfterSplitter(Trim(document.getElementsByClassName('item-page')[0].querySelector('[itemprop=articleBody]').textContent), '.', true); var category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func);
var page = RemoveAfterSplitter(TrimString(document.getElementsByClassName('item-page')[0].querySelector('[itemprop=articleBody]').textContent), '.', true);
content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL; content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
source_add = false; source_add = false;
} }

Loading…
Cancel
Save