001 /**
002 * Copyright (c) 2010, SIB. All rights reserved.
003 *
004 * SIB (Swiss Institute of Bioinformatics) - http://www.isb-sib.ch Host -
005 * https://sourceforge.net/projects/javaprotlib/
006 *
007 * Redistribution and use in source and binary forms, with or without
008 * modification, are permitted provided that the following conditions are met:
009 * Redistributions of source code must retain the above copyright notice, this
010 * list of conditions and the following disclaimer. Redistributions in binary
011 * form must reproduce the above copyright notice, this list of conditions and
012 * the following disclaimer in the documentation and/or other materials provided
013 * with the distribution. Neither the name of the SIB/GENEBIO nor the names of
014 * its contributors may be used to endorse or promote products derived from this
015 * software without specific prior written permission.
016 *
017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
020 * ARE DISCLAIMED. IN NO EVENT SHALL SIB/GENEBIO BE LIABLE FOR ANY DIRECT,
021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
024 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027 */
028 package org.expasy.jpl.commons.collection.symbol.seq;
029
030
031 import java.io.Serializable;
032 import java.text.ParseException;
033 import java.util.ArrayList;
034 import java.util.List;
035 import java.util.regex.MatchResult;
036 import java.util.regex.Matcher;
037 import java.util.regex.Pattern;
038 import org.expasy.jpl.commons.base.builder.BuilderException;
039 import org.expasy.jpl.commons.base.builder.InstanceBuilder;
040 import org.expasy.jpl.commons.collection.symbol.DataSymbolRegister;
041 import org.expasy.jpl.commons.collection.symbol.Symbol;
042 import org.expasy.jpl.commons.collection.symbol.Symbol.SymbolType;
043
044
045 /**
046 * A sequence of symbol letters defined by an alphabet.
047 *
048 * @param <E> the element type.
049 *
050 * @author nikitin
051 *
052 * @version 1.0
053 *
054 */
055 public final class SymbolSequenceImpl<E> implements SymbolSequence<E>,
056 Serializable {
057
058 private static final long serialVersionUID = 1L;
059
060 /** the builder that build instance of SymbolSequenceImpl */
061 private transient Builder<E> builder;
062
063 /** the symbols (only for the root sequence) */
064 private char[] symbols;
065
066 /** the symbol type */
067 private SymbolType<E> symbolType;
068
069 /** manage the access of symbols */
070 private DataSymbolRegister<E> manager;
071
072 /** store the ambiguous symbol positions */
073 private List<Integer> ambiPos;
074
075 /** the beginning and length of subsequence relative to the root */
076 private int startIndexInRoot;
077 private int length;
078
079 /** the sequence root of the sequence hierarchy */
080 private SymbolSequenceImpl<E> root;
081
082 /** the direct tree parent of the current sequence (null for root) */
083 private SymbolSequenceImpl<E> parent;
084
085 /**
086 * This builder builds an instance of JPLAASequence given a numerous number
087 * of possible parameters.
088 *
089 * @author nikitin
090 *
091 */
092 public static class Builder<E> implements
093 InstanceBuilder<SymbolSequenceImpl<E>> {
094
095 // required parameters
096 private String sequenceString;
097 private SymbolSequenceImpl<E> parentSequence;
098
099 // optional parameters
100 private SymbolType<E> type = null;
101 private int from = 0;
102 private int to = 0;
103 private boolean ambiguityEnabled = false;
104
105 /**
106 * The main builder constructor.
107 *
108 * @param sequenceString the sequence string representation
109 */
110 public Builder(final String sequenceString, SymbolType<E> type) {
111 this.sequenceString = sequenceString;
112 this.to = sequenceString.length();
113 this.type = type;
114 }
115
116 /**
117 * An alternative builder constructor.
118 *
119 * @param sequence the sequence to copy.
120 */
121 public Builder(final SymbolSequenceImpl<E> sequence) {
122 this.parentSequence = sequence;
123 if (sequence != null) {
124 this.to = sequence.length();
125 }
126 this.type = sequence.getSymbolType();
127 }
128
129 public Builder<E> ambiguityEnabled() {
130 this.ambiguityEnabled = true;
131 return this;
132 }
133
134 public Builder<E> from(final int from) {
135 this.from = from;
136 return this;
137 }
138
139 public Builder<E> to(final int to) {
140 this.to = to;
141 return this;
142 }
143
144 public boolean allowAmbiguity() {
145 return ambiguityEnabled;
146 }
147
148 public int getSeqLen() {
149 return to - from;
150 }
151
152 /** test if language defined by symbol's alphabet is well recognized */
153 private static <E> void checkLanguage(final SymbolType<E> type,
154 final String sequence) {
155
156 // pattern that must match the overall sequence
157 Pattern pat = getSequencePattern(type);
158
159 final Matcher matches = pat.matcher(sequence);
160
161 if (matches.find()) {
162 final MatchResult result = matches.toMatchResult();
163
164 if (result.start() != 0) {
165 throw new BuilderException(sequence
166 + ": parse error at position " + (result.start() - 1));
167 } else if (result.end() != sequence.length()) {
168 throw new BuilderException(sequence
169 + ": parse error at position " + (result.end()));
170 }
171 } else {
172 throw new BuilderException("parse error: " + sequence
173 + " does not match " + pat.pattern());
174 }
175 }
176
177 private void checkAndSetIntervalIndices(int rootLen) {
178
179 // allows negative from index -> offset from the end
180 if (from < 0) {
181 this.from = rootLen + from;
182 }
183
184 if (to < 0) {
185 this.to = rootLen + to;
186 }
187
188 // testing parameters
189 if (from >= to) {
190 throw new BuilderException("bad interval values: [" + from
191 + "-" + to + "[");
192 }
193 }
194
195 private void checkSequenceStringParams() {
196 checkLanguage(type, sequenceString);
197
198 checkAndSetIntervalIndices(sequenceString.length());
199
200 // get substring if needed
201 if ((from > 0) || (to < sequenceString.length())) {
202 sequenceString = sequenceString.substring(from, to);
203 this.from = 0;
204 this.to = sequenceString.length();
205 }
206 }
207
208 public SymbolSequenceImpl<E> build() throws BuilderException {
209
210 if (type == null) {
211 throw new BuilderException("missing valid symbol type.");
212 }
213
214 if ((sequenceString != null) && (sequenceString.length() > 0)) {
215 checkSequenceStringParams();
216 } else if (parentSequence != null) {
217 checkAndSetIntervalIndices(parentSequence.length());
218 } else {
219 throw new BuilderException("need a valid sequence string "
220 + "or a sequence instance to build instance from.");
221 }
222
223 if (type.getDataSymbolManager() == null) {
224 throw new BuilderException("symbol manager undefined");
225 }
226
227 try {
228 return new SymbolSequenceImpl<E>(this);
229 } catch (final ParseException e) {
230 throw new BuilderException(e);
231 }
232 }
233 }
234
235 /** for serialization only */
236 public SymbolSequenceImpl() {}
237
238 /**
239 * The only constructor that build an instance of JPLAASequence.
240 *
241 * @param builder the builder needed to build the instance.
242 * @throws ParseException
243 * @throws ParseException if the sequence string is not well formatted.
244 * @throws JPLAAByteUndefinedException
245 */
246 protected SymbolSequenceImpl(final Builder<E> builder)
247 throws ParseException {
248
249 symbolType = builder.type;
250 manager = symbolType.getDataSymbolManager();
251
252 length = builder.to - builder.from;
253
254 // instanciation from string (root instance case)
255 if (builder.sequenceString != null) {
256
257 // init list of potential ambiguous sites
258 ambiPos = new ArrayList<Integer>();
259
260 // only the sequence root has a byte array of aas.
261 setCharsFromStr(builder);
262
263 root = this;
264
265 }
266 // instanciation from a sequence instance (parent instance case)
267 else if (builder.parentSequence != null) {
268 parent = builder.parentSequence;
269 root = parent.root;
270
271 // compute root relative indices
272 startIndexInRoot = builder.from + parent.startIndexInRoot;
273
274 // a link to the ambiguous sites
275 ambiPos = parent.ambiPos;
276 }
277 this.builder = builder;
278 }
279
280 public static <E> Pattern getSequencePattern(SymbolType<E> type) {
281 return Pattern.compile("^" + type.getAlphabet().getRegEx() + "+$");
282 }
283
284 public Builder<E> getBuilder() {
285 return builder;
286 }
287
288 @SuppressWarnings("unchecked")
289 @Override
290 public SymbolSequenceImpl<E> clone() {
291 SymbolSequenceImpl<E> clone = null;
292 try {
293 clone = (SymbolSequenceImpl) super.clone();
294
295 clone.ambiPos = new ArrayList<Integer>(ambiPos);
296 clone.length = length;
297 clone.parent = parent;
298 clone.root = root;
299 clone.startIndexInRoot = startIndexInRoot;
300 clone.symbols = symbols.clone();
301 clone.symbolType = symbolType;
302 clone.manager = manager;
303 } catch (CloneNotSupportedException e) {
304 throw new IllegalStateException("cannot make a clone from " + this,
305 e);
306 }
307
308 return clone;
309 }
310
311 /**
312 * Return true if 2 aa sequences are identical
313 */
314 @Override
315 @SuppressWarnings("unchecked")
316 public boolean equals(final Object obj) {
317 if ((obj instanceof SymbolSequenceImpl)
318 && ((SymbolSequenceImpl) obj).toString().equals(this.toString())) {
319 return true;
320 }
321 return false;
322 }
323
324 /**
325 * Same length JPLAASequence have the same hash code.
326 */
327 @Override
328 public int hashCode() {
329 return toSymbolString().hashCode();
330 }
331
332 /**
333 * Create array of aa bytes for the root instance given amino-acids.
334 *
335 * @param aas amino-acids to convert to byte-array.
336 *
337 * @throws ParseException if aas are not well formatted.
338 */
339 private void setCharsFromStr(Builder<E> builder) throws ParseException {
340
341 String seq = builder.sequenceString;
342
343 symbols = new char[length];
344
345 /* convert each character to byte */
346 for (int i = startIndexInRoot; i < length; i++) {
347 symbols[i] = seq.charAt(i);
348
349 if (!manager.lookupSymbolNode(symbols[i]).isLeave()) {
350
351 if (!builder.ambiguityEnabled) {
352 throw new ParseException(symbols[i]
353 + " is ambiguous in sequence " + seq, i);
354 }
355
356 ambiPos.add(i);
357 }
358
359 }
360 }
361
362 /**
363 * @return the length of the amino-acid sequence.
364 */
365 public final int length() {
366 return length;
367 }
368
369 public boolean isAmbiguous() {
370
371 if (ambiPos.size() == 0) {
372 return false;
373 }
374
375 int end = startIndexInRoot + length;
376 for (int pos : ambiPos) {
377 if (pos >= startIndexInRoot && pos < end) {
378 return true;
379 }
380 }
381
382 return false;
383 }
384
385 public final SymbolType<E> getSymbolType() {
386 return symbolType;
387 }
388
389 /**
390 * Returns the symbol character at position i.
391 *
392 * @return the symbol character at position i.
393 *
394 * @throws SequenceOutOfBoundsException if i is out of byte array bounds.
395 */
396 public final Symbol<E> getSymbolAt(final int i) {
397
398 try {
399 checkValidPositionOnRoot(i);
400 return manager.lookupSymbol(root.charAt(startIndexInRoot + i));
401 } catch (final ArrayIndexOutOfBoundsException e) {
402 throw new SequenceOutOfBoundsException("Illegal position ' "
403 + (startIndexInRoot + i) + "', valid interval in ["
404 + (startIndexInRoot + i) + ":" + length + "[");
405 }
406 }
407
408 public final E valueAt(final int i) {
409 return manager.lookupData(charAt(i));
410 }
411
412 public final char charAt(final int i) {
413
414 try {
415 checkValidPositionOnRoot(i);
416 return root.symbols[startIndexInRoot + i];
417 } catch (final ArrayIndexOutOfBoundsException e) {
418 throw new SequenceOutOfBoundsException("Illegal position ' " + i
419 + "', valid interval in [" + 0 + ":" + length + "[");
420 }
421 }
422
423 /**
424 * Warning: [start, end[
425 */
426 public SymbolSequenceImpl<E> subSequence(int start, int end) {
427 try {
428 if (start < 0) {
429 throw new SequenceOutOfBoundsException(
430 "Illegal negative start index ' " + start
431 + "', valid interval in [" + 0 + ":" + end + "[");
432 }
433 return new SymbolSequenceImpl.Builder<E>(this).from(start).to(end)
434 .build();
435 } catch (final BuilderException e) {
436 throw new SequenceOutOfBoundsException("Illegal position ' "
437 + start + "', valid interval in [" + 0 + ":" + end + "]");
438 }
439
440 }
441
442 /**
443 * Check that position index is valid in the aa sequence member.
444 *
445 * @param position position to check.
446 *
447 * @throws SequenceOutOfBoundsException if index is out of bounds.
448 */
449 private final void checkValidPositionOnRoot(int position) {
450 position += startIndexInRoot;
451 if ((position < root.startIndexInRoot) || (position > root.length)) {
452 throw new SequenceOutOfBoundsException(position
453 + " is out of range : [" + root.startIndexInRoot + ":"
454 + root.length + "[");
455 }
456 }
457
458 public final String toSymbolString() {
459 final StringBuffer sb = new StringBuffer();
460
461 for (int i = startIndexInRoot; i < startIndexInRoot + length; i++) {
462 sb.append(root.symbols[i]);
463 }
464
465 return sb.toString();
466 }
467
468 /**
469 * @return the amino acid string without the N and C terminus informations.
470 */
471 @Override
472 public String toString() {
473 return toSymbolString();
474 }
475
476 }