Leetcode[187] Repeated DNA Sequences

###Task1 All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = “AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT”,

Return: [“AAAAACCCCC”, “CCCCCAAAAA”].

###Python ####Dict

class Solution(object):
    def findRepeatedDnaSequences(self, s):
        """
        :type s: str
        :rtype: List[str]
        """
        if len(s) < 10:
            return []
        map = {}
        for i in range(len(s) - 9):
            if s[i:i+10] in map:
                map[s[i:i+10]] += 1
            else:
                map[s[i:i+10]] = 1
        ret = []
        for substr in map:
            if map[substr] > 1:
               ret.append(substr)
        return ret

####Bit number + dict

class Solution(object):
    def findRepeatedDnaSequences(self, s):
        """
        :type s: str
        :rtype: List[str]
        """
        if len(s) < 10:
            return []
        map_val = {}
        map_digit = {'A' : 0,
                     'C' : 1,
                     'G' : 2,
                     'T' : 3}
        sum = 0
        ret = []
        for i in range(len(s)):
            sum = (sum * 4 + map_digit[s[i]]) & 0xFFFFF
            if i < 9:
                continue
            map_val[sum] = map_val.get(sum, 0) + 1
            if map_val[sum] == 2:
                ret.append(s[i - 9:i + 1])
                
        return ret

###Java

import java.util.*;

public class Solution {
    public List<String> findRepeatedDnaSequences(String s) {
    	List<String> list = new ArrayList<String>();
        if (s == null || s.length() < 10) {
        	return list;
        }

        HashMap<Character, Integer> map = new HashMap<Character, Integer>();
        map.put('A', 0);
        map.put('C', 1);
        map.put('G', 2);
        map.put('T', 3);

        HashSet<Integer> first = new HashSet<Integer>();
        HashSet<Integer> second = new HashSet<Integer>();
        int hash = 0;
        for (int i = 0; i < s.length(); i++) {

            if (i < 9) {
                hash = (hash << 2) + map.get(s.charAt(i)); //add current char into hash
            } else {
                hash = (hash << 2) + map.get(s.charAt(i));
                hash = hash & (1 << 20) - 1; //remove those bits in hash that exceeds 20 bits
                // 1 << 20 - 1 generate 20 '1'
                if (first.contains(hash) && !second.contains(hash)) {
                    second.add(hash);
                    list.add(s.substring(i - 9, i + 1));
                } else {
                    first.add(hash);
                }
            }
        }

        return list;
    }

    public static void main(String[] args) {
    	Solution sol = new Solution();
    	List<String> res = sol.findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT");
    	for (String s: res) {
    		System.out.println(s);
    	}
    }
}

###Points

字典+位运算,或者进制转换。由于直接将字符串存入字典会导致Memory Limit Exceeded,采用位操作将字符串转化为整数可以减少内存开销。