Mailing List Archive

[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Add collation for Abkhaz (ab)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399537 )

Change subject: Add collation for Abkhaz (ab)
......................................................................


Add collation for Abkhaz (ab)

* Adding new class AbkhazUppercaseCollation, mapped to 'uppercase-ab'.
* Extended CustomUppercaseCollation with support for sorting digraphs
and for alphabets larger than 64 letters (up to 4096).

Bug: T183430
Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4
---
M autoload.php
A includes/collation/AbkhazUppercaseCollation.php
M includes/collation/Collation.php
M includes/collation/CustomUppercaseCollation.php
M tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
5 files changed, 132 insertions(+), 17 deletions(-)

Approvals:
Brian Wolff: Looks good to me, approved
jenkins-bot: Verified



diff --git a/autoload.php b/autoload.php
index 6b8387b..47c04b9 100644
--- a/autoload.php
+++ b/autoload.php
@@ -6,6 +6,7 @@
$wgAutoloadLocalClasses = [
'APCBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCBagOStuff.php',
'APCUBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCUBagOStuff.php',
+ 'AbkhazUppercaseCollation' => __DIR__ . '/includes/collation/AbkhazUppercaseCollation.php',
'AbstractContent' => __DIR__ . '/includes/content/AbstractContent.php',
'Action' => __DIR__ . '/includes/actions/Action.php',
'ActiveUsersPager' => __DIR__ . '/includes/specials/pagers/ActiveUsersPager.php',
diff --git a/includes/collation/AbkhazUppercaseCollation.php b/includes/collation/AbkhazUppercaseCollation.php
new file mode 100644
index 0000000..e0ea237
--- /dev/null
+++ b/includes/collation/AbkhazUppercaseCollation.php
@@ -0,0 +1,93 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.31
+ *
+ * @file
+ */
+
+class AbkhazUppercaseCollation extends CustomUppercaseCollation {
+
+ public function __construct() {
+ parent::__construct( [
+ 'А',
+ 'Б',
+ 'В',
+ 'Г',
+ 'Гь',
+ 'Гә',
+ 'Ӷ',
+ 'Ӷь',
+ 'Ӷә',
+ 'Д',
+ 'Дә',
+ 'Е',
+ 'Ж',
+ 'Жь',
+ 'Жә',
+ 'З',
+ 'Ӡ',
+ 'Ӡә',
+ 'И',
+ 'К',
+ 'Кь',
+ 'Кә',
+ 'Қ',
+ 'Қь',
+ 'Қә',
+ 'Ҟ',
+ 'Ҟь',
+ 'Ҟә',
+ 'Л',
+ 'М',
+ 'Н',
+ 'О',
+ 'П',
+ 'Ҧ',
+ 'Р',
+ 'С',
+ 'Т',
+ 'Тә',
+ 'Ҭ',
+ 'Ҭә',
+ 'У',
+ 'Ф',
+ 'Х',
+ 'Хь',
+ 'Хә',
+ 'Ҳ',
+ 'Ҳә',
+ 'Ц',
+ 'Цә',
+ 'Ҵ',
+ 'Ҵә',
+ 'Ч',
+ 'Ҷ',
+ 'Ҽ',
+ 'Ҿ',
+ 'Ш',
+ 'Шь',
+ 'Шә',
+ 'Ы',
+ 'Ҩ',
+ 'Џ',
+ 'Џь',
+ 'ь',
+ 'ә',
+ ], Language::factory( 'ab' ) );
+ }
+}
diff --git a/includes/collation/Collation.php b/includes/collation/Collation.php
index 7171a21..30cae5a 100644
--- a/includes/collation/Collation.php
+++ b/includes/collation/Collation.php
@@ -65,6 +65,8 @@
return new CollationEt;
case 'xx-uca-fa':
return new CollationFa;
+ case 'uppercase-ab':
+ return new AbkhazUppercaseCollation;
case 'uppercase-ba':
return new BashkirUppercaseCollation;
case 'uppercase-se':
diff --git a/includes/collation/CustomUppercaseCollation.php b/includes/collation/CustomUppercaseCollation.php
index 301972d..170d5c2 100644
--- a/includes/collation/CustomUppercaseCollation.php
+++ b/includes/collation/CustomUppercaseCollation.php
@@ -32,6 +32,7 @@
* conflicts with other people using private use area)
*
* This does not support fancy things like secondary differences, etc.
+ * (It supports digraphs, trigraphs etc. though.)
*
* It is expected most people will subclass this and just override the
* constructor to hard-code an alphabet.
@@ -45,25 +46,30 @@
private $puaSubset;

/**
- * @note This assumes $alphabet does not contain U+F3000-U+F303F
+ * @note This assumes $alphabet does not contain U+F3000-U+F3FFF
*
* @param array $alphabet Sorted array of uppercase characters.
* @param Language $lang What language for number sorting.
*/
public function __construct( array $alphabet, Language $lang ) {
- // It'd be trivial to extend this past 64, you'd just
- // need a bit of bit-fiddling. Doesn't seem necessary right
- // now.
- if ( count( $alphabet ) < 1 || count( $alphabet ) >= 64 ) {
- throw new UnexpectedValueException( "Alphabet must be < 64 items" );
+ if ( count( $alphabet ) < 1 || count( $alphabet ) >= 4096 ) {
+ throw new UnexpectedValueException( "Alphabet must be < 4096 items" );
}
- $this->alphabet = $alphabet;
+ $this->firstLetters = $alphabet;
+ // For digraphs, only the first letter is capitalized in input
+ $this->alphabet = array_map( [ $lang, 'uc' ], $alphabet );

$this->puaSubset = [];
$len = count( $alphabet );
for ( $i = 0; $i < $len; $i++ ) {
- $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 );
+ $this->puaSubset[] = "\xF3\xB3" . chr( floor( $i / 64 ) + 128 ) . chr( ( $i % 64 ) + 128 );
}
+
+ // Sort these arrays so that any trigraphs, digraphs etc. are first
+ // (and they get replaced first in convertToPua()).
+ $lengths = array_map( 'mb_strlen', $this->alphabet );
+ array_multisort( $lengths, SORT_DESC, $this->firstLetters, $this->alphabet, $this->puaSubset );
+
parent::__construct( $lang );
}

@@ -76,12 +82,17 @@
}

public function getFirstLetter( $string ) {
- // In case a title has a PUA code in it, make it sort
- // under the header for the character it would replace
- // to avoid inconsistent behaviour. This class mostly
- // assumes that people will not use PUA codes.
- return parent::getFirstLetter(
- str_replace( $this->puaSubset, $this->alphabet, $string )
- );
+ $sortkey = $this->getSortKey( $string );
+
+ // In case a title begins with a character from our alphabet, return the corresponding
+ // first-letter. (This also happens if the title has a corresponding PUA code in it, to avoid
+ // inconsistent behaviour. This class mostly assumes that people will not use PUA codes.)
+ $index = array_search( substr( $sortkey, 0, 4 ), $this->puaSubset );
+ if ( $index !== false ) {
+ return $this->firstLetters[ $index ];
+ }
+
+ // String begins with a character outside of our alphabet, fall back
+ return parent::getFirstLetter( $string );
}
}
diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
index 5d5317b..90c097d 100644
--- a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
+++ b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php
@@ -6,6 +6,7 @@
$this->collation = new CustomUppercaseCollation( [
'D',
'C',
+ 'Cs',
'B'
], Language::factory( 'en' ) );

@@ -31,6 +32,7 @@
[ '???? ', 'C', 'Test relocated to end' ],
[ 'c', 'b', 'lowercase' ],
[ 'x', 'z', 'lowercase original' ],
+ [ 'Cz', 'Cs', 'digraphs' ],
[ 'C50D', 'C100', 'Numbers' ]
];
}
@@ -50,8 +52,14 @@
[ 'afdsa', 'A' ],
[ "\xF3\xB3\x80\x80Foo", 'D' ],
[ "\xF3\xB3\x80\x81Foo", 'C' ],
- [ "\xF3\xB3\x80\x82Foo", 'B' ],
- [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ],
+ [ "\xF3\xB3\x80\x82Foo", 'Cs' ],
+ [ "\xF3\xB3\x80\x83Foo", 'B' ],
+ [ "\xF3\xB3\x80\x84Foo", "\xF3\xB3\x80\x84" ],
+ [ 'C', 'C' ],
+ [ 'Cz', 'C' ],
+ [ 'Cs', 'Cs' ],
+ [ 'CS', 'Cs' ],
+ [ 'cs', 'Cs' ],
];
}
}

--
To view, visit https://gerrit.wikimedia.org/r/399537
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Bartosz Dziewoński <matma.rex@gmail.com>
Gerrit-Reviewer: Bartosz Dziewoński <matma.rex@gmail.com>
Gerrit-Reviewer: Brian Wolff <bawolff+wn@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits