Fixed prefixed links (for Arabic Wikipedia)
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
# Splits wiki text into a stream of tokens. A token is an array with the
# keys "type" and "text". Plain text is returned as type "text"; the markup
# recognised by nextToken() ("RFC ", "ISBN ", "[[[", "[[", "]]", runs of
# apostrophes and "----" at the start of a line) is returned as a token whose
# type is the markup itself.
class Tokenizer {
	/* private */ var $mText,	# Text to be processed by the tokenizer
		$mPos,			# Current (byte) position of the tokenizer in $mText
		$mTextLength,		# Length of $mText as reported by strlen()
		$mQueuedToken,		# Tokens that were already found, but not
					# returned yet.
		$linkPrefixExtension;	# Value of $wgLang->linkPrefixExtension(); when
					# true, text directly in front of "[[" is moved
					# into the link token ("prefixed links").

	// Initialises an empty tokenizer. Reads the global $wgLang to find out
	// whether prefixed links have to be handled.
	/* private */ function __construct()
	{
		global $wgLang;

		$this->mText = "";
		$this->mTextLength = 0;
		$this->mPos = 0;
		// This has to be the property that previewToken()/nextToken() use as
		// their queue; otherwise the queue starts out as null.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	// Old-style constructor, kept so that the class still initialises itself
	// on PHP versions that do not know __construct() and so that explicit
	// calls to Tokenizer() keep working.
	/* private */ function Tokenizer()
	{
		$this->__construct();
	}

	# factory function
	# Returns a new Tokenizer that will process the string $s.
	# NOTE(review): does not use $this and looks intended to be called as
	# Tokenizer::newFromString(); it is left non-static to keep the original
	# declaration - confirm how callers invoke it.
	function newFromString( $s )
	{
		$fname = "Tokenizer::newFromString";
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}


	// Return the next token, but do not increase the pointer. The next call
	// to previewToken or nextToken will return the same token again.
	// Actually, the pointer is increased, but the token is queued. The next
	// call to previewToken or nextToken will check the queue and return
	// the stored token.
	function previewToken()
	{
		$fname = "Tokenizer::previewToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// A token is already waiting in the queue; show it without removing it.
			$token = $this->mQueuedToken[0];
		} else {
			// Fetch a fresh token and put it at the front of the queue, ahead of
			// anything nextToken() queued while producing it.
			$token = $this->nextToken();
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}


	// get the next token
	// proceeds character by character through the text, looking for characters needing
	// special attention. Those are currently: I, R, ', [, ], newline
	//
	// Returns the queued token if there is one, false if the text is used up,
	// and otherwise a "text" token holding everything up to the next piece of
	// markup (possibly the empty string). The markup itself is put into the
	// queue and returned by the following call.
	//
	// TODO: handling of French blanks not yet implemented
	function nextToken()
	{
		$fname = "Tokenizer::nextToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return "false".
			$token = false;
		} else {
			$token = array( "text" => "", "type" => "text" );

			// The loop runs one step past the last character on purpose: that
			// final step pushes mPos beyond mTextLength, which is what makes the
			// following call return false.
			while ( $this->mPos <= $this->mTextLength ) {
				if ( $this->mPos < $this->mTextLength ) {
					$ch = $this->mText[$this->mPos];
				} else {
					// End of text: nothing to read, nothing gets appended.
					$ch = "";
				}

				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( "FC " ) ) {
							$this->queueToken( "RFC ", "RFC " );
							// NOTE(review): this leaves mPos on the blank that ends
							// "RFC ", so the blank also starts the following text
							// token. Kept as in the original - confirm it is intended.
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case 'I': // for "ISBN "
						if ( $this->continues( "SBN " ) ) {
							$this->queueToken( "ISBN ", "ISBN " );
							// NOTE(review): as for "RFC ", mPos ends up on the
							// trailing blank - confirm it is intended.
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;
					case "[": // for links "[["
						if ( $this->continues( "[[" ) ) {
							$this->queueToken( "[[[", "" );
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( "[" ) ) {
							$prefix = "";
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								// Find where the trailing run of non-whitespace
								// bytes of the collected text starts ...
								$len = strlen( $token["text"] );
								$split = $len;
								while ( $split > 0
									&& !ctype_space( $token["text"][$split-1] ) )
								{
									$split--;
								}
								// ... and move that run from the text token into
								// the link's open tag.
								if ( $split < $len ) {
									$prefix = substr( $token["text"], $split );
									$token["text"] = substr( $token["text"], 0, $split );
								}
							}
							$this->queueToken( "[[", $prefix );
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "]": // for end of links "]]"
						if ( $this->continues( "]" ) ) {
							$this->queueToken( "]]", "" );
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							// The token type is the whole run of apostrophes.
							$type = "'";
							while ( ( $this->mPos+1 < $this->mTextLength )
								&& $this->mText[$this->mPos+1] == "'" )
							{
								$type .= "'";
								$this->mPos++;
							}

							$this->queueToken( $type, "" );
							$this->mPos++;
							break 2; // switch + while
						}
						break;
					case "\n": // for block levels, actually, only "----" is handled.
					case "\r":
						if ( $this->continues( "----" ) ) {
							$this->queueToken( "----", "" );
							// Skip the line break and the four dashes ...
							$this->mPos += 5;
							// ... and any further dashes that follow them.
							while ( $this->mPos < $this->mTextLength
								and $this->mText[$this->mPos] == "-" )
							{
								$this->mPos++;
							}
							break 2; // switch + while
						}
						break;
				} /* switch */

				// Not the start of any markup: the character is plain text.
				$token["text"] .= $ch;
				$this->mPos++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	// function queueToken
	// appends a token of the given type and text to the end of the queue
	/* private */ function queueToken( $type, $text )
	{
		$this->mQueuedToken[] = array( "type" => $type, "text" => $text );
	}

	// function continues
	// checks whether the mText continues with $cont from mPos+1
	/* private */ function continues( $cont )
	{
		$len = strlen( $cont );
		if ( $len == 0 ) {
			// Nothing to compare; same answer the character loop used to give.
			return $this->mTextLength >= $this->mPos;
		}
		// $cont would occupy the offsets mPos+1 .. mPos+$len; the last of them
		// has to be a valid offset, i.e. smaller than mTextLength.
		if ( $this->mPos + $len >= $this->mTextLength ) {
			return false;
		}
		return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

}