marketplace-dns-2014/network-query.rkt

322 lines
14 KiB
Racket

#lang racket/base
;;
;;; Copyright 2010, 2011, 2012, 2013 Tony Garnock-Jones <tonyg@ccs.neu.edu>
;;;
;;; This file is part of marketplace-dns.
;;;
;;; marketplace-dns is free software: you can redistribute it and/or
;;; modify it under the terms of the GNU General Public License as
;;; published by the Free Software Foundation, either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; marketplace-dns is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with marketplace-dns. If not, see
;;; <http://www.gnu.org/licenses/>.
(require racket/set)
(require racket/match)
(require "api.rkt")
(require "codec.rkt")
(require "zonedb.rkt")
(require syndicate/actor)
(require syndicate/drivers/udp)
(require syndicate/drivers/timer)
(require "tk-dns.rkt")
(provide network-query
(struct-out network-reply))
;; DJB's rules for handling DNS responses. Some of these are handled
;; here (specifically, rules 2 through 5, in the action of
;; filter-dns-reply), some are handled in resolver.rkt (rule 1, in the
;; action of answer-from-zone), and some are handled in the
;; interaction between the resolver and the network-query modules
;; (rule 1 as well, the interplay between CNAME expansion and
;; recursion):
;; <blockquote>
;; When a cache receives a normal DNS response, it learns exactly one
;; of the following five pieces of information:
;;
;; 1. ``The query was not answered because the query name is an
;; alias. I need to change the query name and try again.'' This
;; applies if the answer section of the response contains a CNAME
;; record for the query name and CNAME does not match the query type.
;;
;; 2. ``The query name has no records answering the query, and is also
;; guaranteed to have no records of any other type.'' This applies if
;; the response code is NXDOMAIN and #1 doesn't apply. The amount of
;; time that this information can be cached depends on the contents of
;; the SOA record in the authority section of the response, if there
;; is one.
;;
;; 3. ``The query name has one or more records answering the query.''
;; This applies if the answer section of the response contains one or
;; more records under the query name matching the query type, and #1
;; doesn't apply, and #2 doesn't apply.
;;
;; 4. ``The query was not answered because the server does not have
;; the answer. I need to contact other servers.'' This applies if the
;; authority section of the response contains NS records, and the
;; authority section of the response does not contain SOA records, and
;; #1 doesn't apply, and #2 doesn't apply, and #3 doesn't apply. The
;; ``other servers'' are named in the NS records in the authority
;; section.
;;
;; 5. ``The query name has no records answering the query, but it may
;; have records of another type.'' This applies if #1 doesn't apply,
;; and #2 doesn't apply, and #3 doesn't apply, and #4 doesn't
;; apply. The amount of time that this information can be cached
;; depends on the contents of the SOA record in the authority section,
;; if there is one.
;;
;; This procedure requires an incredible amount of bug-prone parsing
;; for a very small amount of information. The underlying problem is
;; that DNS was designed to declare information in a human-oriented
;; format, rather than to support crucial operations in the simplest
;; possible way.
;; </blockquote>
;;---------------------------------------------------------------------------
;; DJB's djbdns logic for determining whether a response is a lame
;; referral or not is as follows (see his query.c in areas dealing
;; with the variable "flagreferral" and calls to the function
;; "log_lame"):
;;
;; If a response -
;;
;; 1. has response-code no-error (0), and
;; 2. has no CNAME records in the answer section for the domain we're
;; interested in, and
;; 3. has no records in the answer section for the domain and type
;; we're interested in, and
;; 4. has no SOA records in the authority section, and
;; 5. has at least one NS record in the authority section, and
;; 6. that NS record's name is equal to our bailiwick or is not in our
;; bailiwick,
;;
;; then it is a lame referral.
;;
;; Anything with non-zero response-code is clearly not a referral, so
;; that explains (1). If either of checks (2) and (3) fail then the
;; answer is a real, sensible answer to the question we posed. I'm not
;; 100% on why (4) is there; presumably it's to be conservative, and
;; not treat something possibly-valid as definitely-lame? Rules (5)
;; and (6) are the real heart of lameness, where a referral is given
;; to somewhere that can't be more authoritative than the responder
;; was supposed to be.
;;
;; We modify check (4) to ignore SOA records not in bailiwick, just
;; for consistency. It's correct to leave (5) and (6) alone because
;; it's incorrect for a server to refer us to anywhere at the same
;; level of the tree or further up the tree, but we do apply them to
;; every NS record rather than just the first, which is slightly
;; stricter than DJB's rule.
;;---------------------------------------------------------------------------
(define first-timeout 3) ;; seconds
;; A NetworkReply is a (network-reply UniqueID Maybe<CompleteAnswer>)
;; representing the final result of a network query.
(struct network-reply (unique-id answer) #:transparent)
;; (: next-timeout : Natural -> (Option Natural))
(define (next-timeout timeout)
(cond
[(equal? timeout 3) 11]
[(equal? timeout 11) 45]
[else #f]))
;; (: make-dns-query-message : Question Nonnegative-Integer -> DNSMessage)
(define (make-dns-query-message q query-id)
(dns-message query-id
'request
'query
'non-authoritative
'not-truncated
'no-recursion-desired
'no-recursion-available
'no-error
(list q)
'()
'()
'()))
;; (define-type CheckedAnswer (U (Option CompleteAnswer) 'bad-answer 'lame-delegation))
;; (: filter-dns-reply : Question DNSMessage DomainName -> CheckedAnswer)
;; Filters RRs from the answer, authorities, and additional sections
;; of the passed-in `message`, returning the set of RRs surviving the
;; filter. RRs are only accepted if their `rr-name` falls in the
;; bailiwick of the given `zone-origin`. All of this only happens if
;; the passed-in message's `dns-message-response-code` is `'no-error`:
;; if it's `'name-error`, then `#f` is returned, and if it's any other
;; code, `'bad-answer` is returned.
;;
;; In cases where a CompleteAnswer would otherwise be returned, if the
;; answer is in fact a lame delegation (see notes above), then
;; 'lame-delegation is returned instead.
(define (filter-dns-reply q message zone-origin)
(case (dns-message-response-code message)
[(no-error)
;; (: f : (Listof RR) -> (Setof RR))
(define (f l)
(list->set (filter (lambda (claim-rr) (in-bailiwick? (rr-name claim-rr) zone-origin)) l)))
;; Here's where we do the "lame referral" check. This code is
;; nice and simple (though wrong) without it. Ho hum.
(define answers (f (dns-message-answers message)))
(define unfiltered-authorities (dns-message-authorities message))
(define non-subzone-ns-rrs ;; all NS authorities not for a STRICT subzone of our zone-origin
(filter (lambda (rr) (and (eqv? (rdata-type (rr-rdata rr)) 'ns)
(or (equal? (rr-name rr) zone-origin)
(not (in-bailiwick? (rr-name rr) zone-origin)))))
unfiltered-authorities))
(define authorities (f unfiltered-authorities))
(define answers-to-q ;; answers specifically to the question we asked
(set-filter (lambda (rr) (equal? (rr-name rr) (question-name q))) answers))
(define lame?
(and (set-empty? (filter-by-type answers-to-q 'cname))
(set-empty? (filter-rrs answers-to-q (question-type q) (question-class q)))
(set-empty? (filter-by-type authorities 'soa))
(not (null? non-subzone-ns-rrs))))
(if lame?
'lame-delegation
(complete-answer answers
authorities
(f (dns-message-additional message))))]
[(name-error) #f]
[else
(log-info "Abnormal response-code ~v in response to questions ~v"
(dns-message-response-code message)
(dns-message-questions message))
'bad-answer]))
;; (: ip->host-name : IPv4 -> String)
(define (ip->host-name ip-address)
(match-define (vector a b c d) ip-address)
(format "~a.~a.~a.~a" a b c d))
;; (: make-dns-address : IPv4 -> UdpAddress)
(define (make-dns-address ip-address)
(udp-remote-address (ip->host-name ip-address) 53))
;; (: network-query : (All (ParentState)
;; UdpAddress Question DomainName (Listof DomainName) Any ->
;; Void))
(define (network-query s q zone-origin server-names unique-id)
(spawn*
#:name (list 'network-query q)
(field [timeout first-timeout]
[known-addresses #hash()] ;; Hash DomainName (Listof UdpAddress)
[remaining-addresses '()] ;; Listof UdpAddress
[current-name #f] ;; Option DomainName
[remaining-names server-names]) ;; Listof DomainName
(define (on-answer ans server-ip)
(match ans
['bad-answer ;; can come from filter-dns-reply
(try-next-server)]
['lame-delegation ;; can come from filter-dns-reply
(log-info "Lame delegation received from ~v (at ~v) in bailiwick ~v in response to ~v"
(current-name)
server-ip
zone-origin
q)
(when (and (current-name) server-ip)
;; Actually remove the offending IP address so it's never tried again.
(known-addresses (hash-update (known-addresses)
(current-name)
(lambda (addrs) (remove server-ip addrs)))))
(try-next-server)]
[(and (or (? complete-answer?) #f) ans)
(send! (network-reply unique-id ans))]))
(define (try-next-server)
(if (not (timeout))
;; No more timeouts to try, so give up.
(on-answer (empty-complete-answer) #f)
(match (remaining-addresses)
['()
(match (remaining-names)
['()
;; No more addresses to try with this timeout. Refill the list
;; and bump the timeout and retry.
;; TODO: randomize ordering of servers in list. (Don't forget the initial fill.)
(timeout (next-timeout (timeout)))
(current-name #f)
(remaining-names server-names)
(try-next-server)]
[(cons next-name new-remaining-names)
(current-name next-name)
(remaining-names new-remaining-names)
(if (hash-has-key? (known-addresses) next-name)
(begin (remaining-addresses (hash-ref (known-addresses) (current-name)))
(try-next-server))
(let ((subq (ns-question next-name q)))
(react (on-start (send! subq))
(stop-when (message (answered-question subq $ans))
(define ips
(for/list [(a (extract-addresses next-name ans))]
(make-dns-address a)))
(known-addresses (hash-set (known-addresses) next-name ips))
(remaining-addresses ips)
(try-next-server)))))])]
[(cons current-ip new-remaining-ips)
(remaining-addresses new-remaining-ips)
(define rpc-id (gensym 'network-query/allocate-query-id))
(react (on-start (send! `(request ,rpc-id allocate-query-id)))
(stop-when (message `(reply ,rpc-id ,(? exact-nonnegative-integer? $id)))
(remaining-addresses new-remaining-ips)
(send-request id current-ip)))])))
(define (send-request query-id server-ip)
(define query (make-dns-query-message q query-id))
(define reply-wait-id (list s query-id 'reply-wait))
(define timeout-id (list s query-id 'timeout))
(define start-time (current-inexact-milliseconds))
(log-debug "Sending ~v ~v to ~v ~v with ~v seconds of timeout"
q query-id
zone-origin server-ip
(timeout))
;; NB: ANALYSIS: Previous implementation of this used a
;; quasi-join, where one endpoint deleted the other. Here the two
;; stop-when clauses do a similar job. Also, we can pull the
;; `release-query-id` send up to an on-stop clause.
(react (on-start
(send! (dns-request query s server-ip))
(send! (set-timer timeout-id (* (timeout) 1000) 'relative)))
(on-stop
(send! (list 'release-query-id query-id)))
(stop-when (message (timer-expired timeout-id _))
(log-debug "Timed out ~v ~v to ~v ~v after ~v seconds"
q query-id
zone-origin server-ip
(timeout))
(try-next-server))
(stop-when (message (dns-reply
(? (lambda (m) (= (dns-message-id m)
(dns-message-id query)))
$reply-message)
$source
s))
;; TODO: maybe receive only specifically from the queried IP address?
(log-debug "Answer to ~v from ~v ~v in ~v ms~n-- Answers: ~v~n-- Authorities: ~v~n-- Additional: ~v"
q zone-origin server-ip
(inexact->exact (round (- (current-inexact-milliseconds) start-time)))
(dns-message-answers reply-message)
(dns-message-authorities reply-message)
(dns-message-additional reply-message))
(on-answer (filter-dns-reply q reply-message zone-origin) server-ip))))
(try-next-server)))
(define ((dns-message-id-matches? expected) m)
(= (dns-message-id m) expected))