// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.2
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://rg.ru/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
( function ( ) {
'use strict' ;
// Поиск элементов по регулярному выражению
/**
 * Returns the class attribute text of an element.
 *
 * HTML elements expose `className` as a string, but SVG elements expose an
 * SVGAnimatedString object whose text lives in `baseVal`. Returning that
 * object unchanged would make callers match their patterns against
 * "[object SVGAnimatedString]", so it is unwrapped here.
 *
 * @param {Element} a_Element - Element to inspect.
 * @returns {string|undefined} The class text, or whatever `className` holds
 *     when it is neither a string nor an object with a string `baseVal`.
 */
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (class_name && typeof class_name === 'object' && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return class_name;
}
/**
 * Name getter that reads the `nodeName` of an element.
 *
 * @param {Node} a_Element - Node to inspect.
 * @returns {string} The value of the node's `nodeName` property.
 */
function GetNodeName(a_Element) {
    const { nodeName } = a_Element;
    return nodeName;
}
/**
 * Tests whether the name of an element contains a whitespace-delimited token
 * matching the given pattern.
 *
 * The pattern is anchored so that it must start at the beginning of the name
 * or after whitespace, and must be followed by whitespace or the end.
 *
 * @param {function(Element): string} a_GetElementNameFunc - Extracts the name to test.
 * @param {string} a_RegExpPattern - Regular expression source for one token.
 * @param {Element} a_Element - Element whose name is tested.
 * @returns {boolean} True when the pattern matches.
 */
function CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, a_Element) {
    const token_matcher = new RegExp("(?:^|\\s)" + a_RegExpPattern + "(?!\\S)");
    const element_name = a_GetElementNameFunc(a_Element);
    return token_matcher.test(element_name);
}
/**
 * Collects every descendant of a parent whose name matches a pattern.
 *
 * @param {function(Element): string} a_GetElementNameFunc - Extracts the name to test.
 * @param {string} a_RegExpPattern - Pattern forwarded to CheckRegExp.
 * @param {Element|Document} [a_ElementParent] - Search root; any falsy value
 *     (including null) falls back to `document`.
 * @returns {Element[]} Matching descendants in document order.
 */
function FindElementsByRegExp(a_GetElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const search_root = a_ElementParent || document;
    const candidates = search_root.getElementsByTagName('*');
    const matches = [];
    // Walk until the first missing entry, i.e. the end of the collection.
    for (let position = 0; candidates[position]; position += 1) {
        const candidate = candidates[position];
        if (CheckRegExp(a_GetElementNameFunc, a_RegExpPattern, candidate)) {
            matches.push(candidate);
        }
    }
    return matches;
}
// Работа со строками
/**
 * Strips leading and trailing whitespace from a string.
 *
 * @param {string|null|undefined} s - Text to trim; falsy values are treated as ''.
 * @returns {string} The trimmed text.
 */
function TrimString(s) {
    const text = s || '';
    return text.trim();
}
/**
 * Drops everything up to and including the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @returns {string} The text after the first splitter, or the input unchanged
 *     when the splitter does not occur.
 */
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const found_at = a_String.indexOf(a_Splitter);
    return found_at === -1 ? a_String : a_String.substring(found_at + a_Splitter.length);
}
/**
 * Drops everything after the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @param {boolean} a_SaveSplitter - When truthy the splitter itself is kept at
 *     the end of the result; otherwise it is removed too.
 * @returns {string} The shortened text, or the input unchanged when the
 *     splitter does not occur.
 */
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const found_at = a_String.indexOf(a_Splitter);
    if (found_at === -1) {
        return a_String;
    }
    const kept_length = a_SaveSplitter ? a_Splitter.length : 0;
    return a_String.substring(0, found_at + kept_length);
}
/**
 * Removes the query string from a URL by cutting at the first '?'.
 *
 * @param {string} a_Url - URL to clean.
 * @returns {string} The URL without the '?' and everything after it.
 */
function ClearUrl(a_Url) {
    const keep_separator = false;
    return RemoveAfterSplitter(a_Url, '?', keep_separator);
}
/**
 * Builds a text cleaner that strips a sequence of leading markers.
 *
 * @param {string[]} a_RemoveBeforeList - Splitters applied in order; for each
 *     one, everything up to and including its first occurrence is removed.
 * @returns {function(string): string} Cleaner that applies all splitters.
 */
function ClearTextFuncTemplate(a_RemoveBeforeList) {
    return function ClearTextFunc(a_Content) {
        let remaining = a_Content;
        for (const splitter of a_RemoveBeforeList) {
            remaining = RemoveBeforeSplitter(remaining, splitter);
        }
        return remaining;
    };
}
// Работа с контейнерами
/**
 * Finds an image inside the given containers and renders it as a paragraph.
 *
 * Every descendant of every container is inspected; an <img> counts when its
 * `src` is an http(s) URL containing a ".png" or ".jpg" extension. When
 * several images qualify, the last one found wins.
 *
 * @param {Element[]} a_Elements - Containers to search (array or array-like).
 * @param {string} a_TextAlign - Value for the paragraph's `text-align` style.
 * @returns {string} HTML for the image paragraph, or '' when no image qualifies.
 */
function GetImageInContainers(a_Elements, a_TextAlign) {
    // A regex literal is required here: inside a string literal "\." collapses
    // to ".", which would accept any character in place of the extension dot.
    const image_url_pattern = /(https?:\/\/.*\.(?:png|jpg))/;
    let img_src = '';
    // Iterate values, not keys: for...in would also visit enumerable
    // properties that the host page may have added to Array.prototype.
    for (const container of Array.from(a_Elements)) {
        for (const child of Array.from(container.querySelectorAll("*"))) {
            if (child.nodeName === 'IMG' && image_url_pattern.test(child.src)) {
                img_src = child.src;
            }
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + img_src + '" width = "600px"/></p>';
    }
    return '';
}
/**
 * Extracts text from each container and concatenates the results.
 *
 * For a container that has no descendants or has a truthy `innerText`, the
 * text is grabbed from the container itself; otherwise it is grabbed from each
 * descendant and joined. Containers without `querySelectorAll` contribute an
 * empty string (which is still passed to the finishing function).
 *
 * @param {Element[]} a_Elements - Containers to process (array or array-like).
 * @param {function(Element): string} a_GrubTextFunc - Extracts text from one element.
 * @param {function(string, Element): string} [a_FinishWorkFunc] - Optional
 *     post-processor called with the container's text and the container.
 * @returns {string} Concatenated (and optionally post-processed) text.
 */
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    let result = '';
    // Iterate values, not keys: for...in would also visit enumerable
    // properties that the host page may have added to Array.prototype and
    // emit an empty wrapper for each of them.
    for (const container of Array.from(a_Elements)) {
        let content = '';
        if (container.querySelectorAll) {
            const children = container.querySelectorAll("*");
            if (children.length === 0 || container.innerText) {
                content += a_GrubTextFunc(container);
            } else {
                for (const child of Array.from(children)) {
                    content += a_GrubTextFunc(child);
                }
            }
        }
        result += a_FinishWorkFunc ? a_FinishWorkFunc(content, container) : content;
    }
    return result;
}
// Фильтрация элементов
/**
 * Returns the elements accepted by a checker, preserving their order.
 *
 * @param {Element[]} a_Elements - Elements to filter (array or array-like).
 * @param {function(Element): boolean} a_ElementChecker - Called with each
 *     element; a truthy result keeps the element.
 * @returns {Element[]} New array holding the accepted elements.
 */
function FIlterElements(a_Elements, a_ElementChecker) {
    // The wrapper keeps the checker's call signature to a single argument.
    return Array.from(a_Elements).filter((element) => a_ElementChecker(element));
}
/**
 * Element checker that accepts every element.
 *
 * @param {Element} a_Element - Ignored.
 * @returns {boolean} Always true.
 */
function ElementCheckerTrue(a_Element) {
    const accepted = true;
    return accepted;
}
/**
 * Element checker that rejects blocks whose `data-type` is 'article' or
 * 'banner' and accepts everything else.
 *
 * @param {HTMLElement} a_Element - Element whose `dataset.type` is inspected.
 * @returns {boolean} False for 'article' and 'banner' blocks, true otherwise.
 */
function ElementCheckerRia(a_Element) {
    const block_type = a_Element.dataset.type;
    return block_type !== 'article' && block_type !== 'banner';
}
/**
 * Element checker that accepts elements marked as `itemprop="articleBody"`.
 *
 * DOM elements do not expose an `itemprop` property, so the attribute is read
 * with `getAttribute`. A plain `itemprop` property is still honoured for
 * objects that carry one. The attribute is a space-separated token list, so
 * the element is accepted when any token equals 'articleBody'.
 *
 * @param {Element} a_Element - Element to inspect.
 * @returns {boolean} True when the element carries the 'articleBody' item property.
 */
function ElementCheckerZV(a_Element) {
    let item_props = a_Element.itemprop;
    if (item_props == null && typeof a_Element.getAttribute === 'function') {
        item_props = a_Element.getAttribute('itemprop');
    }
    if (typeof item_props !== 'string') {
        return false;
    }
    return item_props.split(/\s+/).includes('articleBody');
}
// Обработка элементов
/**
 * Builds the text extractor used for page elements.
 *
 * The returned function yields the trimmed `textContent` of an element, but
 * only when the element's `innerText` is truthy; otherwise it yields ''.
 * NOTE(review): the `innerText` gate presumably skips elements that render no
 * visible text — confirm.
 *
 * @returns {function(Element): string} Text extractor.
 */
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        if (!a_Element.innerText) {
            return '';
        }
        return TrimString(a_Element.textContent);
    };
}
/**
 * Builds the post-processor that turns extracted text into output HTML.
 *
 * The returned function applies these steps in order:
 *   1. cleans the text with `a_ClearTextFunc`, when one was given;
 *   2. for an element with `data-type="list"`, replaces the text with a <ul>
 *      built from its 'article__list-item' descendants;
 *   3. for an element whose class matches 'PageContentCommonStyling_text.*',
 *      returns the concatenation of its <P> descendants, each processed
 *      recursively, and skips the remaining steps;
 *   4. wraps the text in `a_OutTag` with a `text-align` style, when both the
 *      tag and the alignment were given;
 *   5. for an element with `data-type="quote"`, wraps the result in <blockquote>.
 *
 * @param {string} [a_OutTag] - Tag name used for wrapping.
 * @param {string} [a_TextAlign] - Value for the wrapper's `text-align` style.
 * @param {function(string): string} [a_ClearTextFunc] - Optional text cleaner.
 * @returns {function(string, Element=): string} Post-processor.
 */
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
    function FinishWorkFunc(a_Content, a_Element) {
        let html = a_ClearTextFunc ? a_ClearTextFunc(a_Content) : a_Content;
        const data_type = a_Element && a_Element.dataset && a_Element.dataset.type;

        if (data_type === 'list') {
            const grub_item = GrubTextFuncTemplate();
            let list_items = '';
            for (const item of FindElementsByRegExp(GetElementClassName, 'article__list-item', a_Element)) {
                list_items += '<li>' + grub_item(item) + '</li>';
            }
            html = '<ul>' + list_items + '</ul>';
        }

        if (a_Element && CheckRegExp(GetElementClassName, 'PageContentCommonStyling_text.*', a_Element)) {
            // A text container: emit each paragraph separately instead of the container itself.
            const grub_paragraph = GrubTextFuncTemplate();
            let paragraphs = '';
            for (const paragraph of FindElementsByRegExp(GetNodeName, 'P', a_Element)) {
                paragraphs += FinishWorkFunc(grub_paragraph(paragraph), paragraph);
            }
            return paragraphs;
        }

        if (a_OutTag && a_TextAlign) {
            html = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + html + '</' + a_OutTag + '>';
        }
        if (data_type === 'quote') {
            html = '<blockquote>' + html + '</blockquote>';
        }
        return html;
    }
    return FinishWorkFunc;
}
// Создание контента для стандартных новостей
/**
 * Assembles the HTML for a standard news article: title, image, body text.
 *
 * The title is rendered as a centered <h2>, the image as a centered
 * paragraph, and each text block as a justified <p> after the given leading
 * markers are stripped from it.
 *
 * @param {Element|Document} a_BaseElementTitle - Root searched for the title.
 * @param {Element|Document} a_BaseElementImage - Root searched for the image.
 * @param {Element|Document} a_BaseElementText - Root searched for text blocks.
 * @param {string} a_TitleRegExpElementPattern - Class pattern of title elements.
 * @param {string} a_ImageRegExpElementPattern - Class pattern of image containers.
 * @param {string} a_TextRegExpElementPattern - Class pattern of text blocks.
 * @param {function(Element): boolean} a_ElementChecker - Filter applied to text blocks.
 * @param {string[]} a_ClearTextPatterns - Leading markers removed from each text block.
 * @returns {string} Concatenated title, image and text HTML.
 */
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_ClearTextPatterns) {
    const finish_title = FinishWorkFuncTemplate('h2', 'center');
    const grub = GrubTextFuncTemplate();
    const finish_paragraph = FinishWorkFuncTemplate('p', 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));

    const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
    const title_html = GetContentInContainers(title_elements, grub, finish_title);

    const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
    const image_html = GetImageInContainers(image_elements, 'center');

    const text_elements = FIlterElements(
        FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText),
        a_ElementChecker
    );
    const text_html = GetContentInContainers(text_elements, grub, finish_paragraph);

    return `${title_html}${image_html}${text_html}`;
}
// Создание контента для сайта
/**
 * Builds the markup of the result <textarea> for the current page.
 *
 * The parser is chosen by `location.hostname`:
 *   - tass.ru, ria.ru, rg.ru: the article is converted to HTML by
 *     MakeContentByNews and a "source" paragraph linking back to the page
 *     (without its query string) is appended;
 *   - zakonvremeni.ru: a plain-text summary is built from the title, the
 *     categories, the first sentence of the article body and the page URL.
 *
 * The collected text is escaped before it is placed between the <textarea>
 * tags, so that after the markup is parsed the textarea shows the text
 * exactly as collected.
 *
 * @returns {string} '<textarea …>…</textarea>' markup, or '' when nothing was
 *     collected (unknown host, or no article on the page).
 */
function MakeContent() {
    // Escapes text for the inside of a <textarea>. There the parser still
    // decodes character references (including legacy ones such as "&sect"
    // inside "&section=") and stops at "</textarea>", so '&' and '<' must
    // be escaped.
    function EscapeForTextarea(a_Text) {
        return a_Text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
    }

    let content = '';
    let source_add = true;
    if (location.hostname === 'tass.ru') {
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            'tass_pkg_title--variant_h1_default.*',
            'Image_wrapper_.*',
            'Paragraph_paragraph.*',
            ElementCheckerTrue,
            ['/ТАСС/. ']
        );
    }
    else if (location.hostname === 'ria.ru') {
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        // The agency marker follows a dash; cover the dash variants listed here.
        const tire = ['–', '—', '‒', '―', '⸺', '⸻'];
        const clear_text = tire.map((dash) => dash + ' РИА Новости. ');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            clear_text
        );
    }
    else if (location.hostname === 'rg.ru') {
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            document,
            document,
            base_element_text,
            'PageArticleContent_title.*',
            'PageArticleContent_image.*',
            '(PageContentCommonStyling_text|PageArticleContent_lead).*',
            ElementCheckerRia,
            []
        );
    }
    else if (location.hostname === 'zakonvremeni.ru') {
        const base_element = document.getElementsByClassName('item-page')[0];
        const article_body = base_element ? base_element.querySelector('[itemprop=articleBody]') : null;
        // Pages without an article (no '.item-page' or no article body) yield
        // no content instead of raising a TypeError.
        if (article_body) {
            const grub_text_func = GrubTextFuncTemplate();
            const title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func);
            const parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func);
            const category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func);
            // Keep only the first sentence of the article body, including its period.
            const page = RemoveAfterSplitter(TrimString(article_body.textContent), '.', true);
            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
        }
        source_add = false;
    }

    if (content.length === 0) {
        return '';
    }
    let textarea_text = content;
    if (source_add) {
        textarea_text += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + location.hostname + '</a></p>';
    }
    return '<textarea id = "news_content" rows="10" cols="100">' + EscapeForTextarea(textarea_text) + '</textarea>';
}
// Render the parsed article at the top of the page. Nothing is injected when
// MakeContent() produced no markup, so pages without a recognised article are
// left untouched.
const content = MakeContent();
if (content.length > 0) {
    const logo = document.createElement("div");
    logo.innerHTML = '<div style="margin: 0pt auto; width: 800px; text-align: center;">' + content + '</div>';
    document.body.insertBefore(logo, document.body.firstChild);
}
} ) ( ) ;