marketplace-dns-2014/network-query.rkt

#lang racket/base
;;
;;; Copyright 2010, 2011, 2012, 2013 Tony Garnock-Jones <tonyg@ccs.neu.edu>
;;;
;;; This file is part of marketplace-dns.
;;;
;;; marketplace-dns is free software: you can redistribute it and/or
;;; modify it under the terms of the GNU General Public License as
;;; published by the Free Software Foundation, either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; marketplace-dns is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with marketplace-dns. If not, see
;;; <http://www.gnu.org/licenses/>.

(require racket/set)
(require racket/match)
(require "api.rkt")
(require "codec.rkt")
(require "zonedb.rkt")
(require syndicate/actor)
(require syndicate/drivers/udp)
(require syndicate/drivers/timer)
(require "tk-dns.rkt")

(provide network-query
	 (struct-out network-reply))

;; DJB's rules for handling DNS responses. Some of these are handled
;; here (specifically, rules 2 through 5, in the action of
;; filter-dns-reply), some are handled in resolver.rkt (rule 1, in the
;; action of answer-from-zone), and some are handled in the
;; interaction between the resolver and the network-query modules
;; (rule 1 as well, the interplay between CNAME expansion and
;; recursion):

;; <blockquote>
;; When a cache receives a normal DNS response, it learns exactly one
;; of the following five pieces of information:
;;
;; 1. ``The query was not answered because the query name is an
;; alias. I need to change the query name and try again.'' This
;; applies if the answer section of the response contains a CNAME
;; record for the query name and CNAME does not match the query type.
;;
;; 2. ``The query name has no records answering the query, and is also
;; guaranteed to have no records of any other type.'' This applies if
;; the response code is NXDOMAIN and #1 doesn't apply. The amount of
;; time that this information can be cached depends on the contents of
;; the SOA record in the authority section of the response, if there
;; is one.
;;
;; 3. ``The query name has one or more records answering the query.''
;; This applies if the answer section of the response contains one or
;; more records under the query name matching the query type, and #1
;; doesn't apply, and #2 doesn't apply.
;;
;; 4. ``The query was not answered because the server does not have
;; the answer. I need to contact other servers.'' This applies if the
;; authority section of the response contains NS records, and the
;; authority section of the response does not contain SOA records, and
;; #1 doesn't apply, and #2 doesn't apply, and #3 doesn't apply. The
;; ``other servers'' are named in the NS records in the authority
;; section.
;;
;; 5. ``The query name has no records answering the query, but it may
;; have records of another type.'' This applies if #1 doesn't apply,
;; and #2 doesn't apply, and #3 doesn't apply, and #4 doesn't
;; apply. The amount of time that this information can be cached
;; depends on the contents of the SOA record in the authority section,
;; if there is one.
;;
;; This procedure requires an incredible amount of bug-prone parsing
;; for a very small amount of information. The underlying problem is
;; that DNS was designed to declare information in a human-oriented
;; format, rather than to support crucial operations in the simplest
;; possible way.
;; </blockquote>

;;---------------------------------------------------------------------------

;; DJB's djbdns logic for determining whether a response is a lame
;; referral or not is as follows (see his query.c in areas dealing
;; with the variable "flagreferral" and calls to the function
;; "log_lame"):
;;
;; If a response -
;;
;; 1. has response-code no-error (0), and
;; 2. has no CNAME records in the answer section for the domain we're
;;    interested in, and
;; 3. has no records in the answer section for the domain and type
;;    we're interested in, and
;; 4. has no SOA records in the authority section, and
;; 5. has at least one NS record in the authority section, and
;; 6. that NS record's name is equal to our bailiwick or is not in our
;;    bailiwick,
;;
;; then it is a lame referral.
;;
;; Anything with non-zero response-code is clearly not a referral, so
;; that explains (1). If either of checks (2) and (3) fail then the
;; answer is a real, sensible answer to the question we posed. I'm not
;; 100% on why (4) is there; presumably it's to be conservative, and
;; not treat something possibly-valid as definitely-lame? Rules (5)
;; and (6) are the real heart of lameness, where a referral is given
;; to somewhere that can't be more authoritative than the responder
;; was supposed to be.
;;
;; We modify check (4) to ignore SOA records not in bailiwick, just
;; for consistency. It's correct to leave (5) and (6) alone because
;; it's incorrect for a server to refer us to anywhere at the same
;; level of the tree or further up the tree, but we do apply them to
;; every NS record rather than just the first, which is slightly
;; stricter than DJB's rule.

;;---------------------------------------------------------------------------

(define first-timeout 3) ;; seconds

;; A NetworkReply is a (network-reply UniqueID Maybe<CompleteAnswer>)
;; representing the final result of a network query.
(struct network-reply (unique-id answer) #:transparent)

;; (: next-timeout : Natural -> (Option Natural))
(define (next-timeout timeout)
  (cond
   [(equal? timeout 3) 11]
   [(equal? timeout 11) 45]
   [else #f]))

;; (: make-dns-query-message : Question Nonnegative-Integer -> DNSMessage)
(define (make-dns-query-message q query-id)
  (dns-message query-id
	       'request
	       'query
	       'non-authoritative
	       'not-truncated
	       'no-recursion-desired
	       'no-recursion-available
	       'no-error
	       (list q)
	       '()
	       '()
	       '()))

;; (define-type CheckedAnswer (U (Option CompleteAnswer) 'bad-answer 'lame-delegation))

;; (: filter-dns-reply : Question DNSMessage DomainName -> CheckedAnswer)
;; Filters RRs from the answer, authorities, and additional sections
;; of the passed-in `message`, returning the set of RRs surviving the
;; filter. RRs are only accepted if their `rr-name` falls in the
;; bailiwick of the given `zone-origin`. All of this only happens if
;; the passed-in message's `dns-message-response-code` is `'no-error`:
;; if it's `'name-error`, then `#f` is returned, and if it's any other
;; code, `'bad-answer` is returned.
;;
;; In cases where a CompleteAnswer would otherwise be returned, if the
;; answer is in fact a lame delegation (see notes above), then
;; 'lame-delegation is returned instead.
(define (filter-dns-reply q message zone-origin)
  (case (dns-message-response-code message)
    [(no-error)
     ;; (: f : (Listof RR) -> (Setof RR))
     (define (f l)
       (list->set (filter (lambda (claim-rr) (in-bailiwick? (rr-name claim-rr) zone-origin)) l)))
     ;; Here's where we do the "lame referral" check. This code is
     ;; nice and simple (though wrong) without it. Ho hum.
     (define answers (f (dns-message-answers message)))
     (define unfiltered-authorities (dns-message-authorities message))
     (define non-subzone-ns-rrs ;; all NS authorities not for a STRICT subzone of our zone-origin
       (filter (lambda (rr) (and (eqv? (rdata-type (rr-rdata rr)) 'ns)
				 (or (equal? (rr-name rr) zone-origin)
				     (not (in-bailiwick? (rr-name rr) zone-origin)))))
	       unfiltered-authorities))
     (define authorities (f unfiltered-authorities))
     (define answers-to-q ;; answers specifically to the question we asked
       (set-filter (lambda (rr) (equal? (rr-name rr) (question-name q))) answers))
     (define lame?
       (and (set-empty? (filter-by-type answers-to-q 'cname))
	    (set-empty? (filter-rrs answers-to-q (question-type q) (question-class q)))
	    (set-empty? (filter-by-type authorities 'soa))
	    (not (null? non-subzone-ns-rrs))))
     (if lame?
	 'lame-delegation
	 (complete-answer answers
			  authorities
			  (f (dns-message-additional message))))]
    [(name-error) #f]
    [else
     (log-info "Abnormal response-code ~v in response to questions ~v"
               (dns-message-response-code message)
               (dns-message-questions message))
     'bad-answer]))

;; (: ip->host-name : IPv4 -> String)
(define (ip->host-name ip-address)
  (match-define (vector a b c d) ip-address)
  (format "~a.~a.~a.~a" a b c d))

;; (: make-dns-address : IPv4 -> UdpAddress)
(define (make-dns-address ip-address)
  (udp-remote-address (ip->host-name ip-address) 53))

;; (: network-query : (All (ParentState)
;; 			UdpAddress Question DomainName (Listof DomainName) Any ->
;; 			Void))
(define (network-query s q zone-origin server-names unique-id)
  (spawn*
   #:name (list 'network-query q)
   (field [timeout first-timeout]
          [known-addresses #hash()] ;; Hash DomainName (Listof UdpAddress)
          [remaining-addresses '()] ;; Listof UdpAddress
          [current-name #f] ;; Option DomainName
          [remaining-names server-names]) ;; Listof DomainName

   (define (on-answer ans server-ip)
     (match ans
       ['bad-answer ;; can come from filter-dns-reply
        (try-next-server)]
       ['lame-delegation ;; can come from filter-dns-reply
        (log-info "Lame delegation received from ~v (at ~v) in bailiwick ~v in response to ~v"
                  (current-name)
                  server-ip
                  zone-origin
                  q)
        (when (and (current-name) server-ip)
          ;; Actually remove the offending IP address so it's never tried again.
          (known-addresses (hash-update (known-addresses)
                                        (current-name)
                                        (lambda (addrs) (remove server-ip addrs)))))
        (try-next-server)]
       [(and (or (? complete-answer?) #f) ans)
        (send! (network-reply unique-id ans))]))

   (define (try-next-server)
     (if (not (timeout))
         ;; No more timeouts to try, so give up.
         (on-answer (empty-complete-answer) #f)
         (match (remaining-addresses)
           ['()
            (match (remaining-names)
              ['()
               ;; No more addresses to try with this timeout. Refill the list
               ;; and bump the timeout and retry.
               ;; TODO: randomize ordering of servers in list. (Don't forget the initial fill.)
               (timeout (next-timeout (timeout)))
               (current-name #f)
               (remaining-names server-names)
               (try-next-server)]
              [(cons next-name new-remaining-names)
               (current-name next-name)
               (remaining-names new-remaining-names)
               (if (hash-has-key? (known-addresses) next-name)
                   (begin (remaining-addresses (hash-ref (known-addresses) (current-name)))
                          (try-next-server))
                   (let ((subq (ns-question next-name q)))
                     (react (on-start (send! subq))
                            (stop-when (message (answered-question subq $ans))
                                       (define ips
                                         (for/list [(a (extract-addresses next-name ans))]
                                           (make-dns-address a)))
                                       (known-addresses (hash-set (known-addresses) next-name ips))
                                       (remaining-addresses ips)
                                       (try-next-server)))))])]
           [(cons current-ip new-remaining-ips)
            (remaining-addresses new-remaining-ips)
            (define rpc-id (gensym 'network-query/allocate-query-id))
            (react (on-start (send! `(request ,rpc-id allocate-query-id)))
                   (stop-when (message `(reply ,rpc-id ,(? exact-nonnegative-integer? $id)))
                              (remaining-addresses new-remaining-ips)
                              (send-request id current-ip)))])))

   (define (send-request query-id server-ip)
     (define query (make-dns-query-message q query-id))
     (define reply-wait-id (list s query-id 'reply-wait))
     (define timeout-id (list s query-id 'timeout))
     (define start-time (current-inexact-milliseconds))
     (log-debug "Sending ~v ~v to ~v ~v with ~v seconds of timeout"
                q query-id
                zone-origin server-ip
                (timeout))
     ;; NB: ANALYSIS: Previous implementation of this used a
     ;; quasi-join, where one endpoint deleted the other. Here the two
     ;; stop-when clauses do a similar job. Also, we can pull the
     ;; `release-query-id` send up to an on-stop clause.
     (react (on-start
             (send! (dns-request query s server-ip))
             (send! (set-timer timeout-id (* (timeout) 1000) 'relative)))
            (on-stop
             (send! (list 'release-query-id query-id)))
            (stop-when (message (timer-expired timeout-id _))
                       (log-debug "Timed out ~v ~v to ~v ~v after ~v seconds"
                                  q query-id
                                  zone-origin server-ip
                                  (timeout))
                       (try-next-server))
            (stop-when (message (dns-reply
                                 (? (lambda (m) (= (dns-message-id m)
                                                   (dns-message-id query)))
                                    $reply-message)
                                 $source
                                 s))
                       ;; TODO: maybe receive only specifically from the queried IP address?
                       (log-debug "Answer to ~v from ~v ~v in ~v ms~n-- Answers: ~v~n-- Authorities: ~v~n-- Additional: ~v"
                                  q zone-origin server-ip
                                  (inexact->exact (round (- (current-inexact-milliseconds) start-time)))
                                  (dns-message-answers reply-message)
                                  (dns-message-authorities reply-message)
                                  (dns-message-additional reply-message))
                       (on-answer (filter-dns-reply q reply-message zone-origin) server-ip))))

   (try-next-server)))

(define ((dns-message-id-matches? expected) m)
  (= (dns-message-id m) expected))