001    /**
002     * Copyright (c) 2010, SIB. All rights reserved.
003     * 
004     * SIB (Swiss Institute of Bioinformatics) - http://www.isb-sib.ch Host -
005     * https://sourceforge.net/projects/javaprotlib/
006     * 
007     * Redistribution and use in source and binary forms, with or without
008     * modification, are permitted provided that the following conditions are met:
009     * Redistributions of source code must retain the above copyright notice, this
010     * list of conditions and the following disclaimer. Redistributions in binary
011     * form must reproduce the above copyright notice, this list of conditions and
012     * the following disclaimer in the documentation and/or other materials provided
013     * with the distribution. Neither the name of the SIB/GENEBIO nor the names of
014     * its contributors may be used to endorse or promote products derived from this
015     * software without specific prior written permission.
016     * 
017     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018     * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019     * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
020     * ARE DISCLAIMED. IN NO EVENT SHALL SIB/GENEBIO BE LIABLE FOR ANY DIRECT,
021     * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022     * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023     * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
024     * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025     * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026     * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027     */
028    package org.expasy.jpl.commons.collection.symbol.seq;
029    
030    
031    import java.io.Serializable;
032    import java.text.ParseException;
033    import java.util.ArrayList;
034    import java.util.List;
035    import java.util.regex.MatchResult;
036    import java.util.regex.Matcher;
037    import java.util.regex.Pattern;
038    import org.expasy.jpl.commons.base.builder.BuilderException;
039    import org.expasy.jpl.commons.base.builder.InstanceBuilder;
040    import org.expasy.jpl.commons.collection.symbol.DataSymbolRegister;
041    import org.expasy.jpl.commons.collection.symbol.Symbol;
042    import org.expasy.jpl.commons.collection.symbol.Symbol.SymbolType;
043    
044    
045    /**
046     * A sequence of symbol letters defined by an alphabet.
047     * 
048     * @param <E> the element type.
049     * 
050     * @author nikitin
051     * 
052     * @version 1.0
053     * 
054     */
055    public final class SymbolSequenceImpl<E> implements SymbolSequence<E>,
056        Serializable {
057            
058            private static final long serialVersionUID = 1L;
059            
060            /** the builder that build instance of SymbolSequenceImpl */
061            private transient Builder<E> builder;
062            
063            /** the symbols (only for the root sequence) */
064            private char[] symbols;
065            
066            /** the symbol type */
067            private SymbolType<E> symbolType;
068            
069            /** manage the access of symbols */
070            private DataSymbolRegister<E> manager;
071            
072            /** store the ambiguous symbol positions */
073            private List<Integer> ambiPos;
074            
075            /** the beginning and length of subsequence relative to the root */
076            private int startIndexInRoot;
077            private int length;
078            
079            /** the sequence root of the sequence hierarchy */
080            private SymbolSequenceImpl<E> root;
081            
082            /** the direct tree parent of the current sequence (null for root) */
083            private SymbolSequenceImpl<E> parent;
084            
085            /**
086             * This builder builds an instance of JPLAASequence given a numerous number
087             * of possible parameters.
088             * 
089             * @author nikitin
090             * 
091             */
092            public static class Builder<E> implements
093                InstanceBuilder<SymbolSequenceImpl<E>> {
094                    
095                    // required parameters
096                    private String sequenceString;
097                    private SymbolSequenceImpl<E> parentSequence;
098                    
099                    // optional parameters
100                    private SymbolType<E> type = null;
101                    private int from = 0;
102                    private int to = 0;
103                    private boolean ambiguityEnabled = false;
104                    
105                    /**
106                     * The main builder constructor.
107                     * 
108                     * @param sequenceString the sequence string representation
109                     */
110                    public Builder(final String sequenceString, SymbolType<E> type) {
111                            this.sequenceString = sequenceString;
112                            this.to = sequenceString.length();
113                            this.type = type;
114                    }
115                    
116                    /**
117                     * An alternative builder constructor.
118                     * 
119                     * @param sequence the sequence to copy.
120                     */
121                    public Builder(final SymbolSequenceImpl<E> sequence) {
122                            this.parentSequence = sequence;
123                            if (sequence != null) {
124                                    this.to = sequence.length();
125                            }
126                            this.type = sequence.getSymbolType();
127                    }
128                    
129                    public Builder<E> ambiguityEnabled() {
130                            this.ambiguityEnabled = true;
131                            return this;
132                    }
133                    
134                    public Builder<E> from(final int from) {
135                            this.from = from;
136                            return this;
137                    }
138                    
139                    public Builder<E> to(final int to) {
140                            this.to = to;
141                            return this;
142                    }
143                    
144                    public boolean allowAmbiguity() {
145                            return ambiguityEnabled;
146                    }
147                    
148                    public int getSeqLen() {
149                            return to - from;
150                    }
151                    
152                    /** test if language defined by symbol's alphabet is well recognized */
153                    private static <E> void checkLanguage(final SymbolType<E> type,
154                        final String sequence) {
155                            
156                            // pattern that must match the overall sequence
157                            Pattern pat = getSequencePattern(type);
158                            
159                            final Matcher matches = pat.matcher(sequence);
160                            
161                            if (matches.find()) {
162                                    final MatchResult result = matches.toMatchResult();
163                                    
164                                    if (result.start() != 0) {
165                                            throw new BuilderException(sequence
166                                                + ": parse error at position " + (result.start() - 1));
167                                    } else if (result.end() != sequence.length()) {
168                                            throw new BuilderException(sequence
169                                                + ": parse error at position " + (result.end()));
170                                    }
171                            } else {
172                                    throw new BuilderException("parse error: " + sequence
173                                        + " does not match " + pat.pattern());
174                            }
175                    }
176                    
177                    private void checkAndSetIntervalIndices(int rootLen) {
178                            
179                            // allows negative from index -> offset from the end
180                            if (from < 0) {
181                                    this.from = rootLen + from;
182                            }
183                            
184                            if (to < 0) {
185                                    this.to = rootLen + to;
186                            }
187                            
188                            // testing parameters
189                            if (from >= to) {
190                                    throw new BuilderException("bad interval values: [" + from
191                                        + "-" + to + "[");
192                            }
193                    }
194                    
195                    private void checkSequenceStringParams() {
196                            checkLanguage(type, sequenceString);
197                            
198                            checkAndSetIntervalIndices(sequenceString.length());
199                            
200                            // get substring if needed
201                            if ((from > 0) || (to < sequenceString.length())) {
202                                    sequenceString = sequenceString.substring(from, to);
203                                    this.from = 0;
204                                    this.to = sequenceString.length();
205                            }
206                    }
207                    
208                    public SymbolSequenceImpl<E> build() throws BuilderException {
209                            
210                            if (type == null) {
211                                    throw new BuilderException("missing valid symbol type.");
212                            }
213                            
214                            if ((sequenceString != null) && (sequenceString.length() > 0)) {
215                                    checkSequenceStringParams();
216                            } else if (parentSequence != null) {
217                                    checkAndSetIntervalIndices(parentSequence.length());
218                            } else {
219                                    throw new BuilderException("need a valid sequence string "
220                                        + "or a sequence instance to build instance from.");
221                            }
222                            
223                            if (type.getDataSymbolManager() == null) {
224                                    throw new BuilderException("symbol manager undefined");
225                            }
226                            
227                            try {
228                                    return new SymbolSequenceImpl<E>(this);
229                            } catch (final ParseException e) {
230                                    throw new BuilderException(e);
231                            }
232                    }
233            }
234            
235            /** for serialization only */
236            public SymbolSequenceImpl() {}
237            
238            /**
239             * The only constructor that build an instance of JPLAASequence.
240             * 
241             * @param builder the builder needed to build the instance.
242             * @throws ParseException
243             * @throws ParseException if the sequence string is not well formatted.
244             * @throws JPLAAByteUndefinedException
245             */
246            protected SymbolSequenceImpl(final Builder<E> builder)
247                throws ParseException {
248                    
249                    symbolType = builder.type;
250                    manager = symbolType.getDataSymbolManager();
251                    
252                    length = builder.to - builder.from;
253                    
254                    // instanciation from string (root instance case)
255                    if (builder.sequenceString != null) {
256                            
257                            // init list of potential ambiguous sites
258                            ambiPos = new ArrayList<Integer>();
259                            
260                            // only the sequence root has a byte array of aas.
261                            setCharsFromStr(builder);
262                            
263                            root = this;
264                            
265                    }
266                    // instanciation from a sequence instance (parent instance case)
267                    else if (builder.parentSequence != null) {
268                            parent = builder.parentSequence;
269                            root = parent.root;
270                            
271                            // compute root relative indices
272                            startIndexInRoot = builder.from + parent.startIndexInRoot;
273                            
274                            // a link to the ambiguous sites
275                            ambiPos = parent.ambiPos;
276                    }
277                    this.builder = builder;
278            }
279            
280            public static <E> Pattern getSequencePattern(SymbolType<E> type) {
281                    return Pattern.compile("^" + type.getAlphabet().getRegEx() + "+$");
282            }
283            
284            public Builder<E> getBuilder() {
285                    return builder;
286            }
287            
288            @SuppressWarnings("unchecked")
289            @Override
290            public SymbolSequenceImpl<E> clone() {
291                    SymbolSequenceImpl<E> clone = null;
292                    try {
293                            clone = (SymbolSequenceImpl) super.clone();
294                            
295                            clone.ambiPos = new ArrayList<Integer>(ambiPos);
296                            clone.length = length;
297                            clone.parent = parent;
298                            clone.root = root;
299                            clone.startIndexInRoot = startIndexInRoot;
300                            clone.symbols = symbols.clone();
301                            clone.symbolType = symbolType;
302                            clone.manager = manager;
303                    } catch (CloneNotSupportedException e) {
304                            throw new IllegalStateException("cannot make a clone from " + this,
305                                e);
306                    }
307                    
308                    return clone;
309            }
310            
311            /**
312             * Return true if 2 aa sequences are identical
313             */
314            @Override
315            @SuppressWarnings("unchecked")
316            public boolean equals(final Object obj) {
317                    if ((obj instanceof SymbolSequenceImpl)
318                        && ((SymbolSequenceImpl) obj).toString().equals(this.toString())) {
319                            return true;
320                    }
321                    return false;
322            }
323            
324            /**
325             * Same length JPLAASequence have the same hash code.
326             */
327            @Override
328            public int hashCode() {
329                    return toSymbolString().hashCode();
330            }
331            
332            /**
333             * Create array of aa bytes for the root instance given amino-acids.
334             * 
335             * @param aas amino-acids to convert to byte-array.
336             * 
337             * @throws ParseException if aas are not well formatted.
338             */
339            private void setCharsFromStr(Builder<E> builder) throws ParseException {
340                    
341                    String seq = builder.sequenceString;
342                    
343                    symbols = new char[length];
344                    
345                    /* convert each character to byte */
346                    for (int i = startIndexInRoot; i < length; i++) {
347                            symbols[i] = seq.charAt(i);
348                            
349                            if (!manager.lookupSymbolNode(symbols[i]).isLeave()) {
350                                    
351                                    if (!builder.ambiguityEnabled) {
352                                            throw new ParseException(symbols[i]
353                                                + " is ambiguous in sequence " + seq, i);
354                                    }
355                                    
356                                    ambiPos.add(i);
357                            }
358                            
359                    }
360            }
361            
362            /**
363             * @return the length of the amino-acid sequence.
364             */
365            public final int length() {
366                    return length;
367            }
368            
369            public boolean isAmbiguous() {
370                    
371                    if (ambiPos.size() == 0) {
372                            return false;
373                    }
374                    
375                    int end = startIndexInRoot + length;
376                    for (int pos : ambiPos) {
377                            if (pos >= startIndexInRoot && pos < end) {
378                                    return true;
379                            }
380                    }
381                    
382                    return false;
383            }
384            
385            public final SymbolType<E> getSymbolType() {
386                    return symbolType;
387            }
388            
389            /**
390             * Returns the symbol character at position i.
391             * 
392             * @return the symbol character at position i.
393             * 
394             * @throws SequenceOutOfBoundsException if i is out of byte array bounds.
395             */
396            public final Symbol<E> getSymbolAt(final int i) {
397                    
398                    try {
399                            checkValidPositionOnRoot(i);
400                            return manager.lookupSymbol(root.charAt(startIndexInRoot + i));
401                    } catch (final ArrayIndexOutOfBoundsException e) {
402                            throw new SequenceOutOfBoundsException("Illegal position ' "
403                                + (startIndexInRoot + i) + "', valid interval in ["
404                                + (startIndexInRoot + i) + ":" + length + "[");
405                    }
406            }
407            
408            public final E valueAt(final int i) {
409                    return manager.lookupData(charAt(i));
410            }
411            
412            public final char charAt(final int i) {
413                    
414                    try {
415                            checkValidPositionOnRoot(i);
416                            return root.symbols[startIndexInRoot + i];
417                    } catch (final ArrayIndexOutOfBoundsException e) {
418                            throw new SequenceOutOfBoundsException("Illegal position ' " + i
419                                + "', valid interval in [" + 0 + ":" + length + "[");
420                    }
421            }
422            
423            /**
424             * Warning: [start, end[
425             */
426            public SymbolSequenceImpl<E> subSequence(int start, int end) {
427                    try {
428                            if (start < 0) {
429                                    throw new SequenceOutOfBoundsException(
430                                        "Illegal negative start index ' " + start
431                                            + "', valid interval in [" + 0 + ":" + end + "[");
432                            }
433                            return new SymbolSequenceImpl.Builder<E>(this).from(start).to(end)
434                                .build();
435                    } catch (final BuilderException e) {
436                            throw new SequenceOutOfBoundsException("Illegal position ' "
437                                + start + "', valid interval in [" + 0 + ":" + end + "]");
438                    }
439                    
440            }
441            
442            /**
443             * Check that position index is valid in the aa sequence member.
444             * 
445             * @param position position to check.
446             * 
447             * @throws SequenceOutOfBoundsException if index is out of bounds.
448             */
449            private final void checkValidPositionOnRoot(int position) {
450                    position += startIndexInRoot;
451                    if ((position < root.startIndexInRoot) || (position > root.length)) {
452                            throw new SequenceOutOfBoundsException(position
453                                + " is out of range : [" + root.startIndexInRoot + ":"
454                                + root.length + "[");
455                    }
456            }
457            
458            public final String toSymbolString() {
459                    final StringBuffer sb = new StringBuffer();
460                    
461                    for (int i = startIndexInRoot; i < startIndexInRoot + length; i++) {
462                            sb.append(root.symbols[i]);
463                    }
464                    
465                    return sb.toString();
466            }
467            
468            /**
469             * @return the amino acid string without the N and C terminus informations.
470             */
471            @Override
472            public String toString() {
473                    return toSymbolString();
474            }
475            
476    }