MediaWiki:Gadget-OCR-tools.js

Материал из Викитеки — свободной библиотеки

Замечание: Возможно, после публикации вам придётся очистить кэш своего браузера, чтобы увидеть изменения.

  • Firefox / Safari: Удерживая клавишу Shift, нажмите на панели инструментов Обновить либо нажмите Ctrl+F5 или Ctrl+R (⌘+R на Mac)
  • Google Chrome: Нажмите Ctrl+Shift+R (⌘+Shift+R на Mac)
  • Internet Explorer / Edge: Удерживая Ctrl, нажмите Обновить либо нажмите Ctrl+F5
  • Opera: Нажмите Ctrl+F5.
	/**
	 * Toolbar action: run the paragraph clean-up (ocr_1_inner) on the edit box.
	 *
	 * With a non-empty selection only the selected text is cleaned and replaced
	 * in place. Otherwise the whole content is processed: a leading and a
	 * trailing <noinclude>…</noinclude> section are dropped first, and the
	 * cleaned body becomes the new content of the textarea.
	 */
	function ocr_1() {
		var $editor = $('#wpTextbox1');
		var picked = $editor.textSelection('getSelection');

		if (picked !== '') {
			// Selection mode: keep line feeds at the edges of the fragment (r = 0).
			$editor.textSelection('encapsulateSelection', {
				pre: ocr_1_inner(picked, 0),
				replace: true
			});
			return;
		}

		// Whole-page mode: strip the header/footer noinclude sections (fix 25-02-2017 by @Lozman),
		// then clean the body and also trim line feeds at its edges (r = 1).
		var body = $editor
			.textSelection('getContents')
			.replace(/^<noinclude>[\s\S]+?<\/noinclude>([\s\S]+)<noinclude>[\s\S]+?<\/noinclude>$/g, '$1');

		$editor.textSelection('setContents', ocr_1_inner(body, 1));
	}

	/**
	 * Normalise whitespace in OCR text and join the lines of every paragraph.
	 *
	 * Steps, in order: strip trailing and leading spaces/tabs on each line,
	 * collapse runs of spaces/tabs to one space, glue lines that were broken
	 * at a hyphen ("-", "¬" or a soft hyphen), and join the remaining
	 * single line breaks with a space. Blank lines (paragraph breaks) are kept.
	 *
	 * @param {string} sel Text to clean up.
	 * @param {*} r When truthy, line feeds at the very start and end of the text are removed too.
	 * @return {string} The cleaned-up text.
	 */
	function ocr_1_inner(sel, r) {
		sel = sel.replace(/[ \t]+$/gm, '');  // remove trailing spaces
		sel = sel.replace(/^[ \t]+/gm, '');  // remove leading spaces
		sel = sel.replace(/[ \t]+/gm, ' ');  // remove multiple spaces

		// The character after the line feed is only looked at, not consumed. If it were
		// consumed it could not serve as the "\S before the break" of the next match, and
		// consecutive one-character lines ("a\nb\nc") would be merged only pairwise.
		sel = sel.replace(/(\S)[\-¬\u00AD]\n(?=\S)/g, '$1'); // merge lines split at a hyphen
		sel = sel.replace(/(\S)\n(?=\S)/g, '$1 ');           // merge other lines of the paragraph

		if (r) {
			// No "m" flag here: only the very start and very end of the text are trimmed.
			sel = sel.replace(/^\n+/, '').replace(/\n+$/, '');
		}

		return sel;
	}

	/**
	 * Toolbar action: run ocr_2_inner over the whole content of the edit box.
	 *
	 * A leading and a trailing <noinclude>…</noinclude> section are dropped
	 * before processing; the result replaces the textarea content.
	 */
	function ocr_2() {
		var textArea = $('#wpTextbox1');

		// Declared locally: the bare assignment used before created a global `txt`
		// (and throws a ReferenceError in strict mode).
		var txt = textArea.textSelection('getContents');
		txt = txt.replace(/^<noinclude>[\s\S]+?<\/noinclude>([\s\S]+)<noinclude>[\s\S]+?<\/noinclude>$/g, '$1'); // Fix 25-02-2017 by @Lozman
		txt = ocr_2_inner(txt);
		textArea.textSelection('setContents', txt);
	}

	/**
	 * Wrap the word at the very beginning of the text in {{Перенос2|…|word}}.
	 *
	 * Line feeds are temporarily replaced by \x01 so that, despite the "m"
	 * flag, `^` does not match after every "\n"; leading line feeds are kept
	 * in front of the inserted template.
	 *
	 * @param {string} sel Text to process.
	 * @return {string} Text with its leading word wrapped, or unchanged when it does not start with a word.
	 */
	function ocr_2_inner(sel) {
		var HIDDEN_LF = '\x01';
		var leadingWord = /^(\x01*)([\-a-zA-Z0-9А-Яа-яёЁІіѢѣѲѳѴѵ]+)/gm;

		var masked = sel.split('\n').join(HIDDEN_LF);              // hide \n's
		var wrapped = masked.replace(leadingWord, '$1{{Перенос2|…|$2}}');

		return wrapped.split(HIDDEN_LF).join('\n');                // restore hidden \n's
	}

	/**
	 * Toolbar action: run ocr_3_inner over the whole content of the edit box.
	 *
	 * A leading and a trailing <noinclude>…</noinclude> section are dropped
	 * before processing; the result replaces the textarea content.
	 */
	function ocr_3() {
		var textArea = $('#wpTextbox1');

		// Declared locally: the bare assignment used before created a global `txt`
		// (and throws a ReferenceError in strict mode).
		var txt = textArea.textSelection('getContents');
		txt = txt.replace(/^<noinclude>[\s\S]+?<\/noinclude>([\s\S]+)<noinclude>[\s\S]+?<\/noinclude>$/g, '$1'); // Fix 25-02-2017 by @Lozman
		txt = ocr_3_inner(txt);
		textArea.textSelection('setContents', txt);
	}

	/**
	 * Wrap the last word of the text in {{Перенос|word|…}}.
	 *
	 * A word qualifies when it sits at the end of the text or directly before
	 * a <noinclude> tag (optionally separated from it by line feeds). A "-",
	 * "¬" or soft hyphen immediately after the word is dropped. Line feeds are
	 * temporarily replaced by \x01 so that, despite the "m" flag, `$` does not
	 * match before every "\n".
	 *
	 * @param {string} sel Text to process.
	 * @return {string} Text with the matching word wrapped, or unchanged when nothing matches.
	 */
	function ocr_3_inner(sel) {
		var HIDDEN_LF = '\x01';
		var trailingWord = /([\-a-zA-Z0-9А-Яа-яёЁІіѢѣѲѳѴѵ]*[a-zA-Z0-9А-Яа-яёЁІіѢѣѲѳѴѵ])[-¬\u00AD]?(\x01*<noinclude>|$)/gm;

		var masked = sel.split('\n').join(HIDDEN_LF);              // hide \n's
		var wrapped = masked.replace(trailingWord, '{{Перенос|$1|…}}$2');

		return wrapped.split(HIDDEN_LF).join('\n');                // restore hidden \n's
	}

	/**
	 * Toolbar action: wrap text in the {{ВАР|…|…}} template (via ocr_4_inner)
	 * and select the second template argument so it can be edited right away.
	 *
	 * With a non-empty selection only the selection is wrapped in place.
	 * Otherwise the whole content is wrapped, after dropping a leading and a
	 * trailing <noinclude>…</noinclude> section.
	 */
	function ocr_4() {
		var textArea = $('#wpTextbox1');

		// Lengths of the fixed pieces produced by ocr_4_inner():
		// '{{ВАР|' + text + '<!--\n-->|<!--\n-->' + text + '\n}}'
		var PREFIX_LEN = 6;       // '{{ВАР|'
		var TO_SECOND_ARG_LEN = 9; // '<!--\n-->|' — from the end of the first copy to just after '|'
		var SUFFIX_LEN = 3;       // '\n}}'

		var base = 0; // offset of the generated markup inside the textarea
		var len;      // length of the text being wrapped (was an implicit global)

		var txt = textArea.textSelection('getSelection');
		if (txt === '') {
			txt = textArea.textSelection('getContents');
			txt = txt.replace(/^<noinclude>[\s\S]+?<\/noinclude>([\s\S]+)<noinclude>[\s\S]+?<\/noinclude>$/g, '$1'); // Fix 25-02-2017 by @Lozman
			len = txt.length;
			txt = ocr_4_inner(txt);
			textArea.textSelection('setContents', txt);
		} else {
			// The markup replaces the selection, so it starts where the selection started.
			// Without this offset the range below is only right for a selection at position 0.
			base = textArea.textSelection('getCaretPosition', {startAndEnd: true})[0];
			len = txt.length;
			txt = ocr_4_inner(txt);
			textArea.textSelection('encapsulateSelection', {pre: txt, replace: true});
		}

		// Start: just after the '|' separator. The previous formula yielded this position
		// plus 0.5 and relied on the DOM truncating it; compute the integer directly.
		var n1 = base + PREFIX_LEN + len + TO_SECOND_ARG_LEN;
		// End: one character before the closing '}}', i.e. including the final line feed.
		var n2 = base + txt.length - (SUFFIX_LEN - 1);

		textArea.textSelection('setSelection', {start: n1, end: n2});

		textArea.textSelection('scrollToCaretPosition');
	}

	/**
	 * Build the {{ВАР|…|…}} markup: the given text appears twice, as the first
	 * and as the second template argument. The line feeds around the "|"
	 * separator are placed inside HTML comments.
	 *
	 * @param {string} sel Text to duplicate.
	 * @return {string} The template markup.
	 */
	function ocr_4_inner(sel) {
		var opening = '{{ВАР|';
		var divider = '<!--\n-->|<!--\n-->';
		var closing = '\n}}';

		return opening + sel + divider + sel + closing;
	}

// Value of the wiki's 'wgContentLanguage' config key, read once when the script loads.
// ocr_5() sends it as the `lang` parameter of the OCR request.
var lang = mw.config.get( 'wgContentLanguage' );

/**
 * Lock or unlock the editor while an OCR request is running.
 *
 * Locking disables the textarea, unbinds the click handlers of the
 * #ocr_tools_5 button and lets the user press ESC to unlock again.
 * Unlocking re-enables the textarea and binds ocr_5 to the button.
 *
 * @param {boolean} set true to lock, false to unlock.
 */
function ocr5_disable_input(set)
{
	var $button = $('#ocr_tools_5');

	// The ESC listener is namespaced and removed on every call. Previously each lock
	// added one more permanent document handler, and every later ESC press re-bound
	// ocr_5 once per leaked handler.
	$(document).off('keyup.ocr5');

	if (set) {
		$(document).on('keyup.ocr5', function(e) {
			// You can press ESC to kill the query
			if (e.which === 27) { ocr5_disable_input(false); }
		});
		$button.off('click');
	} else {
		// Unbind first so repeated unlocks never stack several ocr_5 handlers
		// (which would fire several requests for one click).
		$button.off('click', ocr_5).on('click', ocr_5);
	}

	$('#wpTextbox1').prop('disabled', set);
}

/**
 * Handle the JSON answer of the hOCR service requested by ocr_5().
 *
 * On `data.error` the editor is simply unlocked. Otherwise, if the textarea
 * is still disabled (the request was not cancelled), the plain text of
 * `data.text` is extracted, its line breaks are normalised and it replaces
 * the textarea value. The editor is unlocked in every case.
 *
 * @param {Object} data Response object; this function reads `error` and `text`.
 */
function ocr5_callback (data) {
	if (data.error) {
		ocr5_disable_input (false);
		// Deliberately no fallback to tesseract (do_ocr) on Russian Wikisource.
		return;
	}

	// Checking if tb is disabled is required with chrome as ESC doesn't kill
	// the query.
	var tb = document.getElementById("wpTextbox1");
	if (tb.disabled) {
		try {
			localStorage.ws_hOCR = data.text;
		} catch (e) {
			// Caching the raw hOCR is best-effort: a full or disabled storage must not
			// abort the callback and leave the textarea locked.
		}

		// NOTE(review): this builds DOM nodes from the remote response; it is only safe
		// as long as the OCR service is trusted — confirm.
		var text = $(data.text).text();
		// Ugly as hell: drop spaces around line feeds, halve single blank lines away,
		// and keep runs of four line feeds as one blank line (via a temporary marker).
		text = text.replace(/[ ]*\n[ ]*/g, '\n')
			.replace(/\n\n\n\n/g, '@_@_@_@_@_@')
			.replace(/\n\n/g, '\n')
			.replace(/@_@_@_@_@_@/g, '\n\n')
			.replace(/\n\n\n/g, '\n\n');
		// Native trim() instead of the deprecated $.trim().
		tb.value = text.trim();
	}

	ocr5_disable_input(false);
}

/**
 * Toolbar action: lock the editor and request the hOCR text of the current
 * page from the phetools service; ocr5_callback handles the answer.
 */
function ocr_5 () {
	ocr5_disable_input(true);

	// Every value is URL-encoded: user names may contain '&', '#', spaces or
	// non-ASCII characters that would otherwise corrupt the query string.
	var request_url = '//tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr&book='
		+ encodeURIComponent(mw.config.get('wgTitle'))
		+ '&lang=' + encodeURIComponent(lang)
		+ '&user=' + encodeURIComponent(mw.config.get('wgUserName'));

	$.getJSON(request_url)
		.done(ocr5_callback)
		.fail(function () {
			// Network/HTTP/parse failure: unlock the editor instead of leaving it disabled.
			ocr5_disable_input(false);
		});
}
	
/**
 * Debug helper: replace the whole content of the edit box with the string 'test'.
 */
function ocr_test () {
	var textArea = $('#wpTextbox1');

	// The previous version first read and stripped the current content into an
	// undeclared global `txt`, then overwrote it; that dead read is gone.
	var txt = 'test';
	textArea.textSelection('setContents', txt);
}

/**
 * Toolbar action: run ocr_6_inner on the selected text and replace the
 * selection with the result. Does nothing when no text is selected.
 */
function ocr_6() {
	var $editor = $('#wpTextbox1');
	var picked = $editor.textSelection('getSelection');

	if (picked === '') {
		// nothing is selected
		return;
	}

	$editor.textSelection('encapsulateSelection', {
		pre: ocr_6_inner(picked, 0),
		replace: true
	});
}

/**
 * Remove every space and tab character from the text; line feeds are kept.
 *
 * @param {string} sel Text to process.
 * @return {string} The text without spaces and tabs.
 */
function ocr_6_inner(sel) {
	var blanks = /[ \t]+/gm;

	return sel.replace(blanks, '');
}